diff --git a/.Rprofile b/.Rprofile index 9454c40..652962d 100644 --- a/.Rprofile +++ b/.Rprofile @@ -1,8 +1,8 @@ ## --- Custom R repositories (CRAN + R-universe) ------------------------ options(repos = c( - pecanproject = "https://pecanproject.r-universe.dev", - ajlyons = "https://ajlyons.r-universe.dev", # caladaptr - CRAN = "https://cloud.r-project.org" + pecanproject = "https://pecanproject.r-universe.dev", + ajlyons = "https://ajlyons.r-universe.dev", # caladaptr + CRAN = "https://cloud.r-project.org" )) # Sys.setenv(R_LIBS_USER = file.path( @@ -61,4 +61,4 @@ if (!interactive() && requireNamespace("knitr", quietly = TRUE)) { dpi = 144 ) } -cat("global Rprofile loaded\n") +cat("repository Rprofile loaded\n") diff --git a/.gitignore b/.gitignore index 954c56a..4d30745 100644 --- a/.gitignore +++ b/.gitignore @@ -1,33 +1,44 @@ -# R-specific files +########## Operating system / Editor ########## +.DS_Store +Thumbs.db +.vscode/ +.idea/ + +########## R / RStudio ########## .Rhistory .RData .Ruserdata .Rproj.user/ *.Rproj +Rplots.pdf -# Quarto directories -/.quarto/ +########## Quarto build outputs ########## /_site/ /_book/ -*/docs/ -!/docs/** /_freeze/ +/.quarto/ /_files/ +# Keep only top-level docs; ignore nested docs directories elsewhere +*/docs/ +!/docs/** -# VScode settings -.vscode/ - -# Knitr + R Markdown cache +########## Knitr / R Markdown caches ########## *_cache/ /cache/ *_files/ -*html - -# System files -.DS_Store -Thumbs.db +*.utf8.md +*.knit.md +*.html -# Temporary files -*.tmp +########## Logs / temp ########## *.log +*.tmp +nohup.out + +########## Deployment / misc ########## +rsconnect/ + +########## Generated figures ########## +# Do not track any rendered images; site publishing copies from _site +figures/ diff --git a/000-config.R b/000-config.R index d9d729b..5ccff91 100644 --- a/000-config.R +++ b/000-config.R @@ -1,78 +1,107 @@ ### Workflow Configuration Settings ### -# Check that we are in the correct working directory 
### Workflow Configuration Settings ###

## Command-line arguments ----------------------------------------------------
# Dev mode speeds up workflows by subsetting data for testing and debugging.
parser <- argparse::ArgumentParser()
parser$add_argument("--production",
  type = "logical", default = TRUE,
  help = "Set to true for production mode, false for faster development (default: TRUE)"
)
args <- parser$parse_args()
PRODUCTION <- args$production

# Interactive sessions never receive CLI flags, so pin the mode explicitly.
if (rlang::is_interactive()) {
  PRODUCTION <- TRUE
}

## Parallel processing --------------------------------------------------------
# Leave one core free for the OS / interactive use.
no_cores <- max(future::availableCores() - 1, 1)
future::plan(future::multicore, workers = no_cores)

## Directories -----------------------------------------------------------------
# Resolution order: CCMMF_DIR env var > shared cluster path > repository root.
ccmmf_dir <- Sys.getenv("CCMMF_DIR")
if (ccmmf_dir == "") {
  ccmmf_dir <- "/projectnb2/dietzelab/ccmmf"
  if (!dir.exists(ccmmf_dir)) {
    # Fallback to repository root for local development
    ccmmf_dir <- here::here()
  }
}
pecan_outdir <- file.path(ccmmf_dir, "modelout", "ccmmf_phase_2b_mixed_pfts_20250701")

# PEcAn model output archive to be analyzed
pecan_archive_tgz <- file.path(ccmmf_dir, "lebauer_agu_2025_20251210.tgz")

## Variables to extract --------------------------------------------------------
# See docs/workflow_documentation.qmd for the complete list of outputs.
outputs_to_extract <- c("TotSoilCarb", "AGB")
if (!PRODUCTION) {
  # Subset for faster testing, depending on which part of the workflow is under test
  outputs_to_extract <- outputs_to_extract[1]
}

### Configuration Settings that can be set to default ###
# Assume consistent directory structure for other directories
data_dir <- file.path(ccmmf_dir, "data")
raw_data_dir <- file.path(ccmmf_dir, "data_raw")
cache_dir <- file.path(ccmmf_dir, "cache")
model_outdir <- file.path(pecan_outdir, "out")

## Misc ------------------------------------------------------------------------
set.seed(42)
ca_albers_crs <- "EPSG:3310"

# Guard against accidental edits: this constant must stay NAD83 / California Albers.
if (terra::is.lonlat(ca_albers_crs)) {
  PEcAn.logger::logger.severe("`ca_albers_crs` must remain a projected CRS (EPSG:3310).")
}
ca_albers_info <- terra::crs(ca_albers_crs, describe = TRUE)
if (is.null(ca_albers_info$code) || ca_albers_info$code != 3310) {
  PEcAn.logger::logger.severe("`ca_albers_crs` is reserved for California's standard NAD83 / California Albers (EPSG:3310).")
}
ca_albers_name <- ca_albers_info[["name"]]

#### Messages ####
## Ensure pecan_archive_force is defined (allow override via env var)
if (!exists("pecan_archive_force", inherits = FALSE)) {
  pecan_archive_force <- isTRUE(as.logical(Sys.getenv("PECAN_ARCHIVE_FORCE", "FALSE")))
}

# Banner labels aligned with spaces only (previous version had stray periods,
# e.g. "raw_data_dir. :", that garbled the logged summary).
msg <- glue::glue(
  "\n\n",
  "##### SETTINGS SUMMARY #####\n\n",
  "Running in {ifelse(PRODUCTION, '**PRODUCTION**', '**DEVELOPMENT**')} mode\n\n",
  "### Directory Settings ###\n",
  "- CCMMF directory : {ccmmf_dir}\n",
  "- data_dir : {data_dir}\n",
  "- cache_dir : {cache_dir}\n",
  "- raw_data_dir : {raw_data_dir}\n",
  "- pecan_outdir : {pecan_outdir}\n",
  "- model_outdir : {model_outdir}\n\n",
  # file.path() never returns NA, so no NA guard is needed here
  "- pecan_archive_tgz : {pecan_archive_tgz}\n",
  "- pecan_archive_force : {pecan_archive_force}\n\n",
  "### Other Settings ###\n",
  "- will extract variables: {paste(outputs_to_extract, collapse = ', ')}\n",
  "- ca_albers_crs : {ca_albers_crs}{if (!is.na(ca_albers_name)) paste0(' (', ca_albers_name, ')') else ''}\n"
)
if (!isTRUE(getOption("ccmmf.quiet_banner", FALSE))) {
  PEcAn.logger::logger.info(msg, wrap = FALSE)
}

## Global configuration settings for logging -----------------------------------
options(
  # Print all tibble columns
  tibble.width = Inf,
  # Suppress readr::read_csv messages
  readr.show_col_types = FALSE
)

## Source all R scripts in the R/ directory ------------------------------------
r_scripts <- list.files(file.path(here::here(), "R"), pattern = "\\.R$", full.names = TRUE)
# invisible(): suppress printing the list of source() results
invisible(lapply(r_scripts, source))
#' Combine two-PFT outputs to represent mixed cropping systems
#'
#' Rules for combining woody and annual PFT outputs (stocks or accumulated
#' flux totals) to represent a mixed cropping system. Two methods:
#' - "weighted": PFTs partition area (woody_cover + annual_cover = 1) and the
#'   result is the area-weighted average
#'   `woody_cover * woody_value + annual_cover * annual_value`.
#' - "incremental": preserves the full-area woody baseline (requires
#'   `woody_cover == 1`) and treats annual as an increment relative to an
#'   initial annual baseline:
#'   `woody_value + annual_cover * (annual_value - annual_init)`.
#'
#' All inputs must be vectors of length 1 or a shared common length
#' (recycled with vctrs). Intended for pools or integrated fluxes expressed
#' per unit area (e.g., kg/m2 stocks, or kg/m2 per year accumulated fluxes).
#'
#' Validation rules (severe errors unless otherwise noted):
#' * No input values may be NA (annual_init is only checked when required)
#' * Cover fractions must lie within [0, 1] (within tolerance); applies to
#'   both methods
#' * Method "incremental": `woody_cover` must be 1 (within 0.1%) and
#'   `annual_init` is required
#' * Method "weighted": rows whose `woody_cover + annual_cover` differ from 1
#'   by more than tolerance are set to NA in the result; a single aggregated
#'   warning reports the count
#'
#' @param woody_value numeric. Pool or accumulated flux for the woody PFT (kg/m2).
#' @param annual_value numeric. Pool or accumulated flux for the annual PFT (kg/m2).
#' @param annual_cover numeric. Fractional cover of the annual PFT (0-1).
#' @param woody_cover numeric. Fractional cover of the woody PFT (0-1). Must be 1 when `method` is "incremental".
#' @param annual_init numeric, required for method = "incremental"; the initial annual pool
#'   size at t0 (kg C m-2). Used to compute the delta: (annual_value - annual_init).
#' @param method character. One of "weighted" or "incremental".
#'
#' @return numeric vector of combined values, one per recycled input row.
#'
#' @examples
#' # Weighted mixing (covers sum to 1)
#' combine_mixed_crops(
#'   woody_value = 100, annual_value = 50,
#'   annual_cover = 0.2, woody_cover = 0.8, method = "weighted"
#' )
#'
#' # Incremental: preserve woody baseline (woody_cover==1), add annual increment scaled by cover
#' combine_mixed_crops(
#'   woody_value = 200, annual_value = 220, annual_init = 200,
#'   annual_cover = 0.3, woody_cover = 1.0, method = "incremental"
#' )
combine_mixed_crops <- function(woody_value,
                                annual_value,
                                annual_cover,
                                woody_cover,
                                annual_init = NULL,
                                method = c("weighted", "incremental")) {
  method <- match.arg(method)

  # Internal tolerance for floating point comparisons
  tol <- 1e-3

  # Accept scalars for cover and vectors for values; recycle to a common size
  recycled <- vctrs::vec_recycle_common(
    woody_value = woody_value,
    annual_value = annual_value,
    annual_cover = annual_cover,
    woody_cover = woody_cover,
    annual_init = annual_init,
    .size = vctrs::vec_size_common(
      woody_value,
      annual_value,
      annual_cover,
      woody_cover,
      annual_init
    )
  )
  woody_value <- recycled$woody_value
  annual_value <- recycled$annual_value
  annual_cover <- recycled$annual_cover
  woody_cover <- recycled$woody_cover
  annual_init <- recycled$annual_init

  # NA checks (annual_init only if required for incremental)
  na_checks <- list(
    woody_value = woody_value,
    annual_value = annual_value,
    annual_cover = annual_cover,
    woody_cover = woody_cover
  )
  if (method == "incremental") {
    if (is.null(annual_init)) {
      PEcAn.logger::logger.severe("incremental: annual_init is required but missing.")
    }
    na_checks$annual_init <- annual_init
  }
  na_found <- vapply(na_checks, anyNA, logical(1))
  if (any(na_found)) {
    fields <- paste(names(na_found)[na_found], collapse = ", ")
    PEcAn.logger::logger.severe(paste0("NA values not allowed in inputs: ", fields))
  }

  # Range checks for covers. This applies to BOTH methods, so the message no
  # longer carries the misleading "weighted:" prefix. NA values were rejected
  # above, so no na.rm is needed here.
  out_of_range <- (annual_cover < -tol) | (annual_cover > 1 + tol) |
    (woody_cover < -tol) | (woody_cover > 1 + tol)
  if (any(out_of_range)) {
    n_bad <- sum(out_of_range)
    PEcAn.logger::logger.severe(paste0(
      "cover fractions must be in the range [0,1] (+/- tol). ",
      n_bad, " rows violate this constraint."
    ))
  }

  if (method == "incremental") {
    not_one <- abs(woody_cover - 1) > tol
    if (any(not_one)) {
      n_bad <- sum(not_one)
      PEcAn.logger::logger.severe(paste0(
        "incremental: woody_cover must be 1 (+/- ", tol, "); ", n_bad, " rows violate."
      ))
    }
    return(woody_value + annual_cover * (annual_value - annual_init))
  }

  # weighted method: flag rows whose covers do not partition the area
  sum_cov <- woody_cover + annual_cover
  bad_sum <- abs(sum_cov - 1) > tol
  res <- woody_cover * woody_value + annual_cover * annual_value
  if (any(bad_sum)) {
    n_bad <- sum(bad_sum)
    PEcAn.logger::logger.warn(paste0(
      "weighted: ", n_bad,
      " rows with cover fractions not summing to 1 (within tol); results set to NA for those rows."
    ))
    res[bad_sum] <- NA_real_
  }
  res
}
#' Save Web Optimized Image Formats
#'
#' Saves a ggplot object to a file, supporting both vector and raster formats.
#' Raster formats are post-processed with `magick` for additional optimization.
#'
#' @param filename Character. Path to save the plot, including file extension.
#' @param plot ggplot object. The plot to save. Defaults to the last plot.
#' @param width Numeric. Width of the plot in specified units. Default is 7.
#' @param height Numeric. Height of the plot in specified units. Default is 5.
#' @param units Character. Units for width and height ("in", "cm", "mm"). Default is "in".
#' @param dpi Numeric. Resolution for raster formats. Default is 96.
#' @param quality Numeric. JPEG/WebP quality (1-100). Default is 80.
#' @param bg Character. Background color. Default is "white".
#' @param use_cairo Logical. Use Cairo/svglite for vector devices if available. Default TRUE.
#' @param use_ragg Logical. Use ragg devices for raster if available. Default TRUE.
#' @param png_quantize Logical. Quantize PNG colors via magick to reduce size. Default TRUE.
#' @param png_max Integer. Max colors for PNG quantization. Default 256.
#' @param webp_lossless Logical. Encode WebP losslessly (overrides quality). Default FALSE.
#' @param ... Additional arguments passed to `ggplot2::ggsave`.
#'
#' @return Invisibly returns the filename of the saved plot.
#' @examples
#' \dontrun{
#' ggsave_optimized("plot.png", plot = my_plot, width = 8, height = 6)
#' }
#' @export
ggsave_optimized <- function(
    filename,
    plot = ggplot2::last_plot(),
    width = 7,
    height = 5,
    units = "in",
    dpi = 96,
    quality = 80,
    bg = "white",
    use_cairo = TRUE,
    use_ragg = TRUE,
    png_quantize = TRUE,
    png_max = 256,
    webp_lossless = FALSE,
    ...) {
  ext <- tolower(tools::file_ext(filename))
  have_ragg <- isTRUE(use_ragg) && requireNamespace("ragg", quietly = TRUE)

  # Pick an output device where one improves on ggsave's default.
  dev <- NULL
  if (ext %in% c("pdf", "svg", "eps")) {
    if (isTRUE(use_cairo) && isTRUE(grDevices::capabilities()["cairo"])) {
      dev <- switch(ext,
        pdf = grDevices::cairo_pdf,
        eps = grDevices::cairo_ps,
        svg = if (requireNamespace("svglite", quietly = TRUE)) svglite::svglite else grDevices::svg
      )
    } else if (ext == "svg") {
      dev <- grDevices::svg
    }
  } else if (ext %in% c("png", "jpg", "jpeg") && have_ragg) {
    if (ext == "png") {
      dev <- ragg::agg_png
    } else {
      # Forward `quality` explicitly: ggsave does not pass it to the device,
      # so agg_jpeg would otherwise use its own default and silently ignore
      # the user's setting.
      dev <- function(...) ragg::agg_jpeg(..., quality = quality)
    }
  }

  # Vector formats and png/jpeg go straight through ggsave
  if (ext %in% c("svg", "pdf", "eps", "png", "jpg", "jpeg")) {
    ggplot2::ggsave(
      filename = filename,
      plot = plot,
      width = width,
      height = height,
      units = units,
      dpi = dpi,
      bg = bg,
      device = dev,
      limitsize = FALSE,
      ...
    )
    if (ext == "png" && isTRUE(png_quantize)) {
      # Reduce the palette to shrink file size, rewriting in place.
      # NOTE: image_write takes no `compression_level` argument, so none is passed.
      img <- magick::image_read(filename)
      img <- magick::image_quantize(img, max = png_max)
      magick::image_write(img, path = filename, format = "png")
    }
    return(invisible(filename))
  }

  # WebP and TIFF are not natively supported by ggsave: render to a temporary
  # PNG once (shared helper) and transcode with magick.
  if (ext %in% c("webp", "tif", "tiff")) {
    img <- render_plot_png(
      plot = plot, width = width, height = height, units = units,
      dpi = dpi, bg = bg, use_ragg = have_ragg, ...
    )
    if (ext == "webp") {
      webp_quality <- if (isTRUE(webp_lossless)) 100 else quality
      magick::image_write(img, path = filename, format = "webp", quality = webp_quality)
    } else {
      magick::image_write(img, path = filename, format = "tiff")
    }
    return(invisible(filename))
  }

  stop("Unsupported file extension: ", ext)
}

#' Render a plot to a temporary PNG and read it into magick
#'
#' Internal helper shared by the WebP and TIFF branches of
#' `ggsave_optimized()`. The temporary file is removed automatically when this
#' helper returns, after the image has been read into memory.
#' @noRd
render_plot_png <- function(plot, width, height, units, dpi, bg, use_ragg, ...) {
  tmp <- withr::local_tempfile(fileext = ".png")
  ggplot2::ggsave(
    filename = tmp,
    plot = plot,
    width = width,
    height = height,
    units = units,
    dpi = dpi,
    bg = bg,
    device = if (use_ragg) ragg::agg_png else NULL,
    limitsize = FALSE,
    ...
  )
  magick::image_read(tmp)
}

# ---- helper instrumentation utilities ---------------------------------------

# Current wall-clock time formatted "YYYY-MM-DD HH:MM:SS".
ts_now <- function() format(Sys.time(), "%Y-%m-%d %H:%M:%S")

# Start a simple wall-clock timer.
step_timer <- function() list(start = Sys.time())

# Seconds elapsed since `timer` was created by step_timer().
step_elapsed <- function(timer) as.numeric(difftime(Sys.time(), timer$start, units = "secs"))

# Log approximate resident memory (RSS) of this process.
# Best-effort via `ps` (Linux/macOS); logs "NA" where unavailable.
log_mem <- function(prefix = "") {
  rss <- tryCatch(
    {
      pid <- Sys.getpid()
      # ps reports RSS in kB; convert to MB
      as.numeric(system(paste("ps -o rss= -p", pid), intern = TRUE)) / 1024
    },
    error = function(e) NA_real_
  )
  # Guard length: ps may return zero lines, which a bare is.na() would not handle
  rss_label <- if (length(rss) != 1 || is.na(rss)) "NA" else paste0(round(rss, 1), " MB")
  PEcAn.logger::logger.info(prefix, "Memory ~", rss_label)
}

# Timer for the whole script run (started when this file is sourced).
overall_timer <- step_timer()
#' Match site IDs by ID, then nearest location (or location only)
#'
#' By default, preserves any exact ID matches between `target_df` and
#' `reference_df`, and for targets whose IDs are not present in
#' `reference_df`, matches the nearest reference site by location. Optionally,
#' set `prefer_location = TRUE` to ignore IDs entirely and match every target
#' to its nearest reference by location. Returns one row per target with:
#' target_site_id, matched_site_id, coordinates, distance_m, and a coarse
#' proximity class.
#'
#' @param target_df data.frame with at least id/lat/lon columns
#' @param reference_df data.frame with at least id/lat/lon columns
#' @param target_id_col character. ID column in target_df (default "site_id")
#' @param reference_id_col character. ID column in reference_df (default "site_id")
#' @param target_lat_col character. Latitude column in target_df (default "lat")
#' @param target_lon_col character. Longitude column in target_df (default "lon")
#' @param reference_lat_col character. Latitude column in reference_df (default "lat")
#' @param reference_lon_col character. Longitude column in reference_df (default "lon")
#' @param crs character. CRS of the INPUT coordinates (default "EPSG:4326").
#'   This must reflect how your lat/lon columns are expressed: a geographic
#'   CRS like "EPSG:4326" for decimal degrees; a projected CRS only if the
#'   coordinates are already in that projection. This function does not
#'   reproject; transform beforehand (e.g., with `terra::project`) if needed.
#' @param prefer_location logical. If TRUE, ignore IDs and match all targets by
#'   nearest location (location-only mode). If FALSE (default), first match by
#'   ID and only compute nearest for targets whose IDs are missing.
#' @param map_all logical. If TRUE, compute nearest distances for all target
#'   rows; if FALSE, ID-matched rows are returned with distance 0. `map_all`
#'   never changes which ID is returned, only distance calculations.
#' @param max_distance numeric. Maximum allowable distance (m) for a match;
#'   severe error if exceeded (default 100 m).
#' @return a tibble with one row per target row, containing:
#' \describe{
#'   \item{target_site_id}{Original target site ID}
#'   \item{matched_site_id}{Matched reference site ID (may differ from target if not found by ID)}
#'   \item{target_lat, target_lon}{Original target coordinates}
#'   \item{ref_lat, ref_lon}{Matched reference coordinates}
#'   \item{distance_m}{Distance between target and matched reference (meters)}
#'   \item{close}{Proximity classification: "same location", "very close", "close", "moderate", "far"}
#' }
#' Rows are returned in the same order as target_df.
match_site_ids_by_location <- function(
    target_df,
    reference_df,
    target_id_col = "site_id",
    reference_id_col = "site_id",
    target_lat_col = "lat",
    target_lon_col = "lon",
    reference_lat_col = "lat",
    reference_lon_col = "lon",
    crs = "EPSG:4326",
    prefer_location = FALSE,
    map_all = FALSE,
    max_distance = 100) {
  # Validate columns. Use logger.severe (not logger.error) so execution stops
  # here instead of failing obscurely downstream, consistent with the
  # duplicate-ID checks below.
  req_target <- c(target_id_col, target_lat_col, target_lon_col)
  req_ref <- c(reference_id_col, reference_lat_col, reference_lon_col)
  if (!all(req_target %in% colnames(target_df))) {
    PEcAn.logger::logger.severe(
      "target_df is missing required columns: ",
      paste(setdiff(req_target, colnames(target_df)), collapse = ", ")
    )
  }
  if (!all(req_ref %in% colnames(reference_df))) {
    PEcAn.logger::logger.severe(
      "reference_df is missing required columns: ",
      paste(setdiff(req_ref, colnames(reference_df)), collapse = ", ")
    )
  }

  # Reject IDs that map to more than one coordinate pair (either table).
  id_coord <- target_df |> dplyr::distinct(
    dplyr::across(dplyr::all_of(c(target_id_col, target_lat_col, target_lon_col)))
  )
  if (any(duplicated(id_coord[[target_id_col]]))) {
    PEcAn.logger::logger.severe(
      "target_df has duplicate IDs with different coordinates; ensure sites are unique."
    )
  }
  ref_id_coord <- reference_df |> dplyr::distinct(
    dplyr::across(dplyr::all_of(c(reference_id_col, reference_lat_col, reference_lon_col)))
  )
  if (any(duplicated(ref_id_coord[[reference_id_col]]))) {
    PEcAn.logger::logger.severe(
      "reference_df has duplicate IDs with different coordinates; ensure sites are unique."
    )
  }

  # Annotate rows and, unless in location-only mode, split by ID membership
  by_id <- stats::setNames(reference_id_col, target_id_col)
  target_df <- target_df |>
    dplyr::mutate(`..row..` = dplyr::row_number())

  # Rename the reference coordinate columns BEFORE joining. dplyr only adds
  # `.t`/`.r` suffixes on name collisions, so the previous suffix-based lookup
  # broke whenever target and reference column names differed. Fixed names
  # also keep the target's own columns untouched.
  ref_join <- reference_df |>
    dplyr::select(dplyr::all_of(c(reference_id_col, reference_lat_col, reference_lon_col)))
  names(ref_join) <- c(reference_id_col, "..ref_lat..", "..ref_lon..")

  matched_id <- NULL
  mismatched_id <- target_df
  if (!isTRUE(prefer_location)) {
    matched_id <- target_df |>
      dplyr::inner_join(ref_join, by = by_id)
    mismatched_id <- target_df |>
      dplyr::anti_join(ref_join, by = by_id)

    n_needs <- nrow(mismatched_id)
    if (n_needs == 0) {
      PEcAn.logger::logger.info("All target IDs found in reference by ID.")
    } else {
      PEcAn.logger::logger.warn(
        paste(n_needs, "target sites not in reference by ID; matching by nearest location.")
      )
    }
  } else {
    PEcAn.logger::logger.info("prefer_location=TRUE: matching ALL targets to nearest reference by location (ignoring IDs).")
  }

  # Nearest-reference matches for targets without an ID match
  mapping_miss <- NULL
  if (nrow(mismatched_id) > 0) {
    tgt_vect_miss <- terra::vect(
      mismatched_id,
      geom = c(target_lon_col, target_lat_col),
      crs = crs
    )
    ref_vect <- terra::vect(
      reference_df,
      geom = c(reference_lon_col, reference_lat_col),
      crs = crs
    )
    nearest_site <- terra::nearest(tgt_vect_miss, ref_vect)
    idx <- nearest_site$to_id
    dist_m <- nearest_site$distance

    mapping_miss <- tibble::tibble(
      `..row..` = mismatched_id$`..row..`,
      target_site_id = mismatched_id[[target_id_col]],
      matched_site_id = reference_df[[reference_id_col]][idx],
      target_lat = mismatched_id[[target_lat_col]],
      target_lon = mismatched_id[[target_lon_col]],
      ref_lat = reference_df[[reference_lat_col]][idx],
      ref_lon = reference_df[[reference_lon_col]][idx],
      distance_m = dist_m
    )
  }

  # Build mapping for matched-by-ID rows (skipped in location-only mode)
  mapping_match <- NULL
  if (!isTRUE(prefer_location) && nrow(matched_id) > 0) {
    mapping_match <- tibble::tibble(
      `..row..` = matched_id$`..row..`,
      # join key keeps the target column name; use it for both IDs
      target_site_id = matched_id[[target_id_col]],
      matched_site_id = matched_id[[target_id_col]],
      target_lat = matched_id[[target_lat_col]],
      target_lon = matched_id[[target_lon_col]],
      ref_lat = matched_id[["..ref_lat.."]],
      ref_lon = matched_id[["..ref_lon.."]]
    )

    if (isTRUE(map_all)) {
      tgt_pts <- terra::vect(mapping_match, geom = c("target_lon", "target_lat"), crs = crs)
      ref_pts <- terra::vect(mapping_match, geom = c("ref_lon", "ref_lat"), crs = crs)
      # pairwise = TRUE gives one distance per row, replacing the per-row
      # vapply loop over terra::distance
      mapping_match$distance_m <- as.numeric(terra::distance(tgt_pts, ref_pts, pairwise = TRUE))
    } else {
      mapping_match$distance_m <- 0
    }
  }

  # Combine matched and mismatched, preserve original row order
  mapping <- dplyr::bind_rows(mapping_match, mapping_miss) |>
    dplyr::arrange(`..row..`) |>
    dplyr::mutate(
      close = dplyr::case_when(
        distance_m <= 10 ~ "same location",
        distance_m <= 100 ~ "very close (<=100m)",
        distance_m <= 500 ~ "close (100-500m)",
        distance_m <= 1000 ~ "moderate (500-1000m)",
        distance_m <= 5000 ~ "far (1000-5000m)",
        TRUE ~ "far (>5000m)"
      )
    ) |>
    dplyr::select(-`..row..`)

  # Enforce maximum allowable distance
  if (any(mapping$distance_m > max_distance, na.rm = TRUE)) {
    n_exceed <- sum(mapping$distance_m > max_distance, na.rm = TRUE)
    PEcAn.logger::logger.severe(
      n_exceed, " matched target rows exceed max_distance of ", max_distance, " m"
    )
  }

  return(mapping)
}

#' Update site IDs in a target data.frame by nearest reference site
#'
#' If some target IDs don't exist in reference, update them to the nearest
#' reference IDs by location. If all target IDs exist in reference, the
#' original target_df is returned unchanged.
#'
#' @inheritParams match_site_ids_by_location
#' @param id_col character. ID column in target_df (default "site_id")
#' @return data.frame with potentially updated ID column
update_site_ids_by_location <- function(
    target_df,
    reference_df,
    id_col = "site_id",
    target_lat_col = "lat",
    target_lon_col = "lon",
    reference_id_col = "site_id",
    reference_lat_col = "lat",
    reference_lon_col = "lon",
    crs = "EPSG:4326",
    max_distance = 100) {
  mapping <- match_site_ids_by_location(
    target_df = target_df,
    reference_df = reference_df,
    target_id_col = id_col,
    reference_id_col = reference_id_col,
    target_lat_col = target_lat_col,
    target_lon_col = target_lon_col,
    reference_lat_col = reference_lat_col,
    reference_lon_col = reference_lon_col,
    crs = crs,
    map_all = FALSE,
    max_distance = max_distance
  )

  # Unify id column name for the join, remembering the original
  tdf <- target_df
  orig_id_col <- id_col
  if (orig_id_col != "site_id") {
    names(tdf)[names(tdf) == orig_id_col] <- "site_id"
  }

  # distinct() prevents row fan-out: mapping has one row per target row, so a
  # plain join would multiply rows whenever the same ID occurs on multiple
  # target rows. Duplicate IDs are guaranteed to share coordinates (validated
  # upstream), so distinct() is lossless here.
  id_map <- mapping |>
    dplyr::distinct(target_site_id, matched_site_id)

  updated <- tdf |>
    dplyr::left_join(id_map, by = c("site_id" = "target_site_id")) |>
    dplyr::mutate(site_id = dplyr::if_else(is.na(matched_site_id), site_id, matched_site_id)) |>
    dplyr::select(-matched_site_id)

  # rename back to the original id column name if needed
  if (orig_id_col != "site_id") {
    names(updated)[names(updated) == "site_id"] <- orig_id_col
  }

  return(updated)
}
b/R/mixed_aggregation.R deleted file mode 100644 index 39b8ec3..0000000 --- a/R/mixed_aggregation.R +++ /dev/null @@ -1,70 +0,0 @@ -# TODO: rename function -#' Combine two-PFT outputs to represent mixed cropping systems -#' -#' Canonical rules for combining woody and annual PFT outputs. The function -#' implements two scenarios: -#' - "discrete": the woody and annual covers are assumed to partition the area -#' (woody_cover + annual_cover must equal 1) and the output is a weighted -#' average: woody_cover * woody_value + annual_cover * annual_value. -#' - "overlap": the woody baseline is preserved and the annual contribution is -#' treated as an increment relative to an initial annual baseline: -#' annual_delta = annual_value - annual_init, result = woody_value + -#' annual_cover * annual_delta. In this case annual_init must be provided. -#' -#' Variable classes currently supported include carbon stocks "AGB" and "TotSoilCarb" -#' -#' @param var character. Variable name, must be one of "AGB" or "TotSoilCarb" -#' @param woody_value numeric. Pool size for the woody PFT (kg/m2) -#' @param annual_value numeric. Pool size for the annual PFT (kg/m2) -#' @param annual_init numeric, optional. Required and only used for "overlap" -#' scenario; the initial conditions used to compute the change -#' attributed to the annual crop (annual_value - annual_init). -#' @param annual_cover numeric. Fractional cover of the annual PFT (f_annual). -#' @param woody_cover numeric. Fractional cover of the woody PFT (f_woody). -#' @param scenario character. One of "discrete" (default) or "overlap". -#' -#' @details -#' - Discrete: enforces woody_cover + annual_cover == 1 and returns a simple weighted sum. -#' - Overlap: requires annual_init; computes the incremental effect of the annual crop and -#' adds the increment scaled by annual_cover to the woody baseline. -#' -#' @return Numeric. The combined value according to the selected scenario. 
-#' -#' @author David LeBauer -#' @examples -#' # Discrete mixing (weights sum to 1) -#' combine_value("AGB", woody_value = 100, annual_value = 50, -#' annual_cover = 0.2, woody_cover = 0.8, scenario = "discrete") -#' -#' # Overlap: preserve woody baseline, add annual increment scaled by cover -#' combine_value("TotSoilCarb", woody_value = 200, -#' annual_value = 220, annual_init = 200, -#' annual_cover = 0.3, woody_cover = 0.9, scenario = "overlap") -combine_value <- function(woody_value, annual_value, annual_init = NULL, - annual_cover, woody_cover, - method = c("weighted", "incremental")) { - method <- match.arg(method, choices = c("weighted", "incremental")) - - # Basic NA checks for covers - if (is.na(woody_cover) || is.na(annual_cover)) { - PEcAn.logger::logger.warn("One or both cover fractions are NA; returning NA for combine_value.") - return(NA_real_) - } - - if (method == "weighted") { - sum_cov <- woody_cover + annual_cover - # exact within tolerance -> normal weighted average - if (abs(sum_cov - 1) <= 1e-8) { - return(woody_cover * woody_value + annual_cover * annual_value) - } - } - - if (method == "incremental") { - if (is.null(annual_init) || is.na(annual_init)) { - PEcAn.logger::logger.warn("incremental: annual_init is missing; returning NA.") - return(NA_real_) - } - annual_delta <- annual_value - annual_init - return(woody_value + annual_cover * annual_delta) - } -} diff --git a/README.md b/README.md index 67296fb..c4f8f6a 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,24 @@ -# Downscaling Workflow -## Overview - -The downscaling workflow predicts carbon pools (SOC and AGB) for California crop fields and aggregates predictions to the county level. It uses ensemble-based uncertainty propagation via SIPNET model runs and Random Forest downscaling. 
- -**Key components:** -- Environmental covariate extraction from multiple sources (ERA5, SoilGrids, TWI) -- Design point selection using k-means clustering -- SIPNET model runs at design points -- Random Forest models to downscale from design points to all fields -- County-level aggregation of carbon estimates - -### Concepts - -- **Anchor sites:** Fields with ground truth validation data (e.g., Ameriflux sites) -- **Design points:** Representative fields selected for SIPNET simulation based on environmental clustering -- **Downscaling:** Process of extending predictions from design points to all California crop fields +# Downscaling And Aggregation Workflow +::: {.callout-warning} +This proof of concept is untested and subject to change. Interpret results as illustrative. +::: -## Documentation - -- [Detailed Workflow Documentation](docs/workflow_documentation.qmd) - Step-by-step description of methods -- [Results and Analysis](reports/downscaling_results.qmd) - Visualizations and interpretation +## Overview -## Environment and Setup +This workflow estimates carbon pools (SOC and AGB) for California crop fields and aggregates to the county level. +Key components: -### Key Configuration Files +- Environmental covariates (ERA5, SoilGrids, TWI) +- Design point selection via k-means +- SIPNET simulations at design points [done externally] +- Random Forest downscaling to all fields +- County-level aggregation -- `.future.R` - Sets up parallel processing -- `000-config.R` - Configuration file for the workflow (called in each workflow script) +Configuration: see `000-config.R` for paths, variables, and parallel settings. 
## Quick start @@ -38,67 +26,36 @@ The downscaling workflow predicts carbon pools (SOC and AGB) for California crop # Load modules (geo cluster example) module load R/4.4.0 gdal proj geos sqlite udunits quarto -# Decide where the shared CCMMF area lives (or set in .Renviron) -export CCMMF_DATA_DIR=/projectnb/dietzelab/ccmmf # or $HOME/ccmmf-dev +# Point to the shared CCMMF directory (or set in .Renviron) +export CCMMF_DIR=/projectnb/dietzelab/ccmmf # or $HOME/ccmmf-dev -git clone https://github.com/ccmmf/downscale.git -cd downscale +git clone https://github.com/ccmmf/downscaling.git +cd downscaling # Restore exact packages for this workflow R -e 'if (!requireNamespace("renv", quietly = TRUE)) install.packages("renv"); renv::restore()' - -# Run the pipeline [Not implemented] -# R -e 'targets::tar_make()' ``` -## Running the Workflow +### Run Sequence -The workflow consists of these main scripts: +See full details about how to set up and run the workflows in the [Technical Documentation](docs/workflow_documentation.md#sec-tech-doc). ```bash -# 1. Prepare environmental covariates +# Data prep and clustering Rscript scripts/010_prepare_covariates.R - -# 2. Assign anchor sites to fields Rscript scripts/011_prepare_anchor_sites.R - -# 3. Select design points via clustering Rscript scripts/020_cluster_and_select_design_points.R - -# 4. Cluster diagnostics and visualization Rscript scripts/021_clustering_diagnostics.R -# 5. Extract SIPNET output from model runs -Rscript scripts/030_extract_sipnet_output.R - -# 6. Downscale and aggregate to county level -Rscript scripts/040_downscale_and_aggregate.R - -# 7. Downscale analysis and interpretation -Rscript scripts/041_downscale_analysis.R - -# 8. 
Generate results documentation -quarto render reports/downscaling_results.qmd -``` - -## Advanced Setup and Use - -### Interactive Session on BU Cluster - -```sh -# On geo.bu.edu: -qrsh -l h_rt=3:00:00 -pe omp 16 -l buyin -``` - -### Job Submission +# Extract SIPNET outputs and create mixed-PFT scenarios +Rscript scripts/030_extract_sipnet_output.R +Rscript scripts/031_aggregate_sipnet_output.R -This is an example of how a script can be run on an HPC +# Downscale and aggregate +Rscript scripts/040_downscale.R +Rscript scripts/041_aggregate_to_county.R -```sh -qsub \ - -l h_rt=1:00:00 \ - -pe omp 8 \ - -o logs/03.out \ - -e logs/03.err \ - -b y Rscript downscale/999_workflow_step.R +# Analysis and figures +Rscript scripts/042_downscale_analysis.R +Rscript scripts/043_county_level_plots.R ``` diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 0000000..d702238 --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,62 @@ +project: + type: website + output-dir: _site + preview: + watch-inputs: true + resources: + - README.md + render: + # Explicit pages to render + - index.qmd # renders README + - reports/downscaling_results.qmd + - reports/design_points_analysis.qmd + - reports/variable_importance.qmd + - docs/workflow_documentation.md + - docs/references.md + - docs/mixed_system_prototype.qmd + +website: + title: "Downscaling" + search: true + sidebar: + style: docked + collapse-level: 1 + contents: + - href: index.qmd + text: Home + - section: Reports + contents: + - href: reports/downscaling_results.qmd + text: Downscaling Results + - href: reports/design_points_analysis.qmd + text: Design Points Analysis + - href: reports/variable_importance.qmd + text: Downscaling Model Evaluation + - section: Docs + contents: + - href: docs/workflow_documentation.md + text: Technical Documentation + - href: docs/mixed_system_prototype.qmd + text: Mixed System Prototype + - href: docs/references.md + text: References + tools: + - icon: exclamation-octagon + text: "Status: 
Draft" + - icon: github + href: https://github.com/ccmmf/downscaling + +format: + html: + theme: cosmo + toc: true + self-contained: false + df-print: paged + +# Freeze results so CI can publish without re-executing heavy code. +execute: + echo: false + freeze: auto + +author: "David LeBauer" +date: today diff --git a/docs/mixed_system_prototype.qmd b/docs/mixed_system_prototype.qmd index 240d968..e0ed128 100644 --- a/docs/mixed_system_prototype.qmd +++ b/docs/mixed_system_prototype.qmd @@ -1,11 +1,12 @@ --- title: "Mixed-System Prototype (Two-PFT Aggregation)" author: "David LeBauer" -date: "`r Sys.Date()`" +date: today quarto-cache: true format: html: - self-contained: true + self-contained: false + df-print: default execute: cache: true echo: false @@ -13,7 +14,7 @@ execute: message: false --- -This document prototypes a workflow for modeling croplands with multiple crops ("mixed cropping systems"). +This document presents a prototype workflow for modeling croplands with multiple crops ("mixed cropping systems"). ## Challenge @@ -47,7 +48,7 @@ Technical details of these steps are described in more detail below. - Use the same meteorological driver data (ERA5 ensemble). - Use the same soil texture and soil organic carbon. 3. Settings File: - - For each mixed site, create separate a separate configuration file for each PFT; append `-PFT` to the site ID. + - For each mixed site, create a separate configuration file for each PFT; append `-PFT` to the site ID. 4. For herbaceous crops: - Annual herbaceous: assume start with bare ground (zero biomass). - Perennial herbaceous: start as bare ground run; consider methods for estimating initial biomass in later iterations. @@ -76,6 +77,9 @@ Technical details of these steps are described in more detail below. ### Mixture Methods {#sec-mixture-methods} +These formulas are implemented in `combine_mixed_crops()`. 
This function expects additive quantities +per unit area (stocks or flux totals already integrated over a defined time step). + +#### Two Cases for Mixed Cropping Systems: Discrete and Overlap In a mixed cropping system, we define $f_{woody}$ and $f_{annual}$ as the percent contribution of a crop or plant functional type to ecosystem dynamics. This contribution is not "canopy cover" since that changes over time. Think of this as "percent of a monoculture". This method will build on SIPNET's single PFT design - the outputs from two separate single PFT runs simulate will be combined. @@ -149,8 +153,8 @@ These simulations were run from 2016 through 2024 across 10 sites, with an ensem library(tidyverse) library(lubridate) library(here) -library(stringr) # for label wrapping -library(grid) # grid::unit() for panel spacing +library(stringr) # for label wrapping +library(grid) # grid::unit() for panel spacing source(here::here("000-config.R")) if (!dir.exists(model_outdir)) { @@ -165,33 +169,37 @@ mix_label_fn <- function(x) stringr::str_replace_all(x, " \\+ ", "\n+ ") # labeller that applies the above to mix_description (fallback to a normal wrap for other strips) mix_labeller <- labeller( - mix_description = mix_label_fn, - .default = label_wrap_gen(width = 12) + mix_description = mix_label_fn, + .default = label_wrap_gen(width = 12) ) pal_mix <- c( - monoculture = "#1b9e77", - discrete = "#d95f02", - overlap = "#7570b3" + monoculture = "#1b9e77", + discrete = "#d95f02", + overlap = "#7570b3" ) base_theme <- theme_minimal(base_size = 10) + - theme( - legend.position = "top", - strip.text = element_text(size = 9, lineheight = 0.95, - margin = margin(2, 2, 2, 2)), - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.margin = margin(5, 5, 5, 5), - plot.title.position = "plot", - # center title and subtitle - plot.title = element_text(hjust = 0.5), - plot.subtitle = element_text(hjust = 0.5) - ) + theme( + legend.position = "top", + strip.text = element_text( + size = 
9, lineheight = 0.95, + margin = margin(2, 2, 2, 2) + ), + axis.text.x = element_text(angle = 0, hjust = 0.5), + plot.margin = margin(5, 5, 5, 5), + plot.title.position = "plot", + # center title and subtitle + plot.title = element_text(hjust = 0.5), + plot.subtitle = element_text(hjust = 0.5) + ) wrap_labeller <- labeller(.default = label_wrap_gen(width = 12)) -year_scale_dt <- scale_x_datetime(date_breaks = "1 year", - date_labels = "%Y", - expand = expansion(mult = c(0.01, 0.02))) +year_scale_dt <- scale_x_datetime( + date_breaks = "1 year", + date_labels = "%Y", + expand = expansion(mult = c(0.01, 0.02)) +) ``` ## Mixed-system dataset @@ -222,12 +230,12 @@ combined_output <- readr::read_csv( ) ), mix_type = case_when( - mix_description %in% c("100% annual", "100% woody") ~ "monoculture", - scenario == "discrete" ~ "discrete", - scenario == "overlap" ~ "overlap", - TRUE ~ "other" - ) - ) |> + mix_description %in% c("100% annual", "100% woody") ~ "monoculture", + scenario == "discrete" ~ "discrete", + scenario == "overlap" ~ "overlap", + TRUE ~ "other" + ) + ) |> arrange(mix_description) # variables present (used by time-series sections below) @@ -235,51 +243,122 @@ vars <- unique(combined_output$variable) ``` +## County Comparisons on Woody Sites (Downscaled) + +This section compares three treatment scenarios on the same set of woody sites using the downscaled outputs: + +- 100% woody (baseline) +- 100% woody + 50% annual (overlap) +- 100% annual (counterfactual on woody sites) + +```{r woody-treatments} +treat_path <- file.path(model_outdir, "treatments_woody_sites.csv") +if (file.exists(treat_path)) { + tr <- readr::read_csv(treat_path, show_col_types = FALSE) + + # County-level per-ensemble totals and densities, then average across ensembles + tr_county <- tr |> + dplyr::group_by(model_output, scenario, county, ensemble) |> + dplyr::summarise(total_Mg = sum(total_c_Mg, na.rm = TRUE), total_ha = sum(area_ha, na.rm = TRUE), .groups = "drop") |> + dplyr::mutate( 
+ density_Mg_ha = dplyr::if_else(total_ha > 0, total_Mg / total_ha, NA_real_), + total_Tg = PEcAn.utils::ud_convert(total_Mg, "Mg", "Tg") + ) |> + dplyr::group_by(model_output, scenario, county) |> + dplyr::summarise( + mean_total_Tg = mean(total_Tg, na.rm = TRUE), + mean_density_Mg_ha = mean(density_Mg_ha, na.rm = TRUE), + .groups = "drop" + ) + + # Bring in county boundaries for maps + county_boundaries <- sf::st_read(file.path(data_dir, "ca_counties.gpkg"), quiet = TRUE) + + # Differences vs 100% woody baseline + base <- tr_county |> + dplyr::filter(scenario == "woody_100") |> + dplyr::select(model_output, county, base_total_Tg = mean_total_Tg, base_density_Mg_ha = mean_density_Mg_ha) + diffs <- tr_county |> + dplyr::filter(scenario != "woody_100") |> + dplyr::left_join(base, by = c("model_output", "county")) |> + dplyr::mutate( + delta_total_Tg = mean_total_Tg - base_total_Tg, + delta_density_Mg_ha = mean_density_Mg_ha - base_density_Mg_ha + ) |> + dplyr::left_join(county_boundaries, by = "county") + + # Simple maps: density deltas by scenario and pool + ggplot2::ggplot(diffs, ggplot2::aes(geometry = geom, fill = delta_density_Mg_ha)) + + ggplot2::geom_sf(color = "black", fill = NA) + + ggplot2::geom_sf() + + ggplot2::scale_fill_gradient2(low = "royalblue", mid = "white", high = "firebrick", midpoint = 0, na.value = "white") + + ggplot2::facet_grid(model_output ~ scenario) + + ggplot2::theme_minimal() + + ggplot2::labs(title = "County-level density delta vs 100% woody (on woody sites)", fill = "Delta (Mg/ha)") + + # Table preview + DT::datatable( + diffs |> + dplyr::select(model_output, scenario, county, delta_total_Tg, delta_density_Mg_ha) |> + dplyr::arrange(model_output, scenario, county), + options = list(pageLength = 10, scrollY = "50vh", scroller = TRUE, deferRender = TRUE), + extensions = c("Scroller"), + rownames = FALSE + ) |> + DT::formatSignif(columns = c("delta_total_Tg", "delta_density_Mg_ha"), digits = 2) +} else { + 
cat("treatments_woody_sites.csv not found; run scripts/040_downscale.R to generate.") +} +``` + **Selecting Representative Sites** Next we select representative sites based on their productivity. We use AGB in the 100% woody scenario as an indicator of productivity. We select sites that are near the 15th, 50th, and 85th percentiles of AGB and categorize them as low, medium, and high productivity. ```{r select-representative-sites} agb_site <- combined_output |> - filter(mix_description == "100% woody", variable == "AGB") |> - group_by(site_id) |> - summarise(agb_mean = mean(value_combined), .groups = "drop") + filter(mix_description == "100% woody", variable == "AGB") |> + group_by(site_id) |> + summarise(agb_mean = mean(value_combined), .groups = "drop") if (nrow(agb_site) >= 3) { - qs_agb <- quantile( - agb_site$agb_mean, - probs = c(0.15, 0.50, 0.85) - ) - analysis_sites <- purrr::map_chr(qs_agb, function(qv) { - agb_site |> - mutate(diff = abs(agb_mean - qv)) |> - slice_min(diff, n = 1, with_ties = FALSE) |> - pull(site_id) - }) |> unique() - - if (length(analysis_sites) != 3) { - PEcAn.logger::logger.error("Representative sites not found for all quantiles") - } - site_map <- tibble( - site_id = analysis_sites, - agb_class = factor(c("low_agb", "med_agb", "high_agb"), - levels = c("low_agb", "med_agb", "high_agb")) - ) + qs_agb <- quantile( + agb_site$agb_mean, + probs = c(0.15, 0.50, 0.85) + ) + analysis_sites <- purrr::map_chr(qs_agb, function(qv) { + agb_site |> + mutate(diff = abs(agb_mean - qv)) |> + slice_min(diff, n = 1, with_ties = FALSE) |> + pull(site_id) + }) |> unique() + + if (length(analysis_sites) != 3) { + PEcAn.logger::logger.error("Representative sites not found for all quantiles") + } + site_map <- tibble( + site_id = analysis_sites, + agb_class = factor(c("low_agb", "med_agb", "high_agb"), + levels = c("low_agb", "med_agb", "high_agb") + ) + ) } else { - PEcAn.logger::logger.severe("Looks like something went wrong", - "There are only", 
nrow(agb_site), " sites with AGB data. ") + PEcAn.logger::logger.severe( + "Looks like something went wrong", + "There are only", nrow(agb_site), " sites with AGB data. " + ) } # join and relabel classes for plotting (use human-friendly labels) output_representative_sites <- site_map |> - left_join(combined_output, by = "site_id") |> - mutate( - agb_class = factor( - agb_class, - levels = c("low_agb", "med_agb", "high_agb"), - labels = c("low AGB site", "med AGB site", "high AGB site") + left_join(combined_output, by = "site_id") |> + mutate( + agb_class = factor( + agb_class, + levels = c("low_agb", "med_agb", "high_agb"), + labels = c("low AGB site", "med AGB site", "high AGB site") + ) ) - ) ``` @@ -291,30 +370,30 @@ TODO add initial conditions to table ```{r summary-table} summary_stats <- combined_output |> - filter(year(datetime) == max(year(datetime))) |> - group_by(variable, mix_description, mix_type) |> - summarise( - mean = mean(value_combined), - sd = sd(value_combined), - .groups = "drop" - ) |> - mutate( - # format as "mean (sd)" with one decimal place - val = sprintf("%.1f (%.1f)", mean, sd) - ) |> - select(-mean, -sd) |> - tidyr::pivot_wider( - id_cols = c(mix_description, mix_type), - names_from = variable, - values_from = val - ) + filter(year(datetime) == max(year(datetime))) |> + group_by(variable, mix_description, mix_type) |> + summarise( + mean = mean(value_combined), + sd = sd(value_combined), + .groups = "drop" + ) |> + mutate( + # format as "mean (sd)" with one decimal place + val = sprintf("%.1f (%.1f)", mean, sd) + ) |> + select(-mean, -sd) |> + tidyr::pivot_wider( + id_cols = c(mix_description, mix_type), + names_from = variable, + values_from = val + ) # TODO future iterations, don't assume columns; # use glue to construct dynamically knitr::kable( - summary_stats, - col.names = c("Mixture", "Type", "AGB", "SOC"), - caption = "Summary statistics (mean and standard deviation) for carbon pools by mixture type." 
+ summary_stats, + col.names = c("Mixture", "Type", "AGB", "SOC"), + caption = "Summary statistics (mean and standard deviation) for carbon pools by mixture type." ) ``` @@ -336,61 +415,61 @@ last_year <- lubridate::year(last_dt) # Final-time values per (site x ensemble) combined_output_final_site_ens <- combined_output |> - filter(datetime == last_dt) |> - group_by(variable, mix_description, mix_type, site_id, ensemble_id) |> - summarise( - value = median(value_combined), - .groups = "drop" - ) + filter(datetime == last_dt) |> + group_by(variable, mix_description, mix_type, site_id, ensemble_id) |> + summarise( + value = median(value_combined), + .groups = "drop" + ) # Points: site-level means across ensembles at final time combined_output_site_mean <- combined_output_final_site_ens |> - group_by(variable, mix_description, mix_type, site_id) |> - summarise( - site_mean = mean(value), - .groups = "drop" - ) + group_by(variable, mix_description, mix_type, site_id) |> + summarise( + site_mean = mean(value), + .groups = "drop" + ) # Error bars + median dot: summarise across sites (using site-level means) combined_output_summaries <- combined_output_site_mean |> - group_by(variable, mix_description, mix_type) |> - summarise( - q25 = quantile(site_mean, 0.25), - q50 = quantile(site_mean, 0.5), - q75 = quantile(site_mean, 0.75), - .groups = "drop" - ) - - # Plot: ensemble points (jitter) + IQR + median; facet by variable - ggplot() + - geom_jitter( + group_by(variable, mix_description, mix_type) |> + summarise( + q25 = quantile(site_mean, 0.25), + q50 = quantile(site_mean, 0.5), + q75 = quantile(site_mean, 0.75), + .groups = "drop" + ) + +# Plot: ensemble points (jitter) + IQR + median; facet by variable +ggplot() + + geom_jitter( data = combined_output_site_mean, aes(x = mix_description, y = site_mean, color = mix_type), width = 0.15, alpha = 0.5, size = 1.2 - ) + - geom_errorbar( + ) + + geom_errorbar( data = combined_output_summaries, aes(x = mix_description, ymin = 
q25, ymax = q75, color = mix_type), width = 0.4, linewidth = 0.6 - ) + - geom_point( + ) + + geom_point( data = combined_output_summaries, aes(x = mix_description, y = q50, color = mix_type), size = 2 - ) + - facet_wrap(~ variable, scales = "free_y", nrow = 1, labeller = mix_labeller) + # use mix_labeller - scale_x_discrete(labels = mix_label_fn) + # wrap x labels into two lines - scale_color_manual(values = pal_mix) + - labs( + ) + + facet_wrap(~variable, scales = "free_y", nrow = 1, labeller = mix_labeller) + # use mix_labeller + scale_x_discrete(labels = mix_label_fn) + # wrap x labels into two lines + scale_color_manual(values = pal_mix) + + labs( title = paste0("End-Year Carbon Pool Sizes (", last_year, ")"), subtitle = "Points = site means; bars = IQR; dot = median", x = "Mixture", y = expression("Carbon Pool Size (" * kg ~ C ~ m^-2 * ")"), color = "Type" - ) + - base_theme + - theme(axis.text.x = element_text(angle = 35, hjust = 1)) - + ) + + base_theme + + theme(axis.text.x = element_text(angle = 35, hjust = 1)) + ``` ### Effect Size (Final Year AGB and SOC Compared to Annual Monoculture) @@ -498,6 +577,26 @@ ggplot() + ) ``` +## Effect on County Level Carbon Stocks + +Here we present county-level maps showing the impact of adding 50% ground cover to orchards on carbon stocks and densities. Positive values indicate an increase relative to woody-only. + +These maps are generated from the downscaled outputs that are presented in the [Downscaling Results](../reports/downscaling_results.qmd#results) page. 
+ +### Change in Carbon Stocks (Tg): + +![](../figures/county_diff_woody_plus_annual_minus_woody_TotSoilCarb_carbon_stock.webp) + + +![](../figures/county_diff_woody_plus_annual_minus_woody_AGB_carbon_stock.webp) + + +### Change in Carbon Density (Mg/ha): + +![](../figures/county_diff_woody_plus_annual_minus_woody_TotSoilCarb_carbon_density.webp) + +![](../figures/county_diff_woody_plus_annual_minus_woody_AGB_carbon_density.webp) + ## Other Approaches Considered @@ -577,16 +676,17 @@ for (v in vars) { aes(datetime, q50, group = mix_description), color = "black", linewidth = .5 ) + - facet_wrap(~mix_description, - nrow = 1, - labeller = mix_labeller, - scales = "free_y") + + facet_wrap(~mix_description, + nrow = 1, + labeller = mix_labeller, + scales = "free_y" + ) + year_scale_dt + scale_color_manual(values = pal_mix) + labs( title = paste("Time Series (Median & IQR):", v), subtitle = "Lines=ensembles; black=median; ribbon=IQR", - x = NULL, y = bquote("Carbon Pool Size (" * kg ~ C ~ m^-2 * ")"), + x = NULL, y = bquote("Carbon Pool Size (" * kg ~ C ~ m^-2 * ")"), color = "Category" ) + base_theme @@ -667,4 +767,4 @@ for (v in vars) { | Annual → Perennial | Change PFT; partial biomass retained. | High | Low | Low | | Agroforestry / Woody Buffers | Independent woody runs; spatial aggregation. | High | Low | Low | -: Healthy Soils Program practices, model representation, and priority for implementation in SIPNET. {#tbl-hsp} \ No newline at end of file +: Healthy Soils Program practices, model representation, and priority for implementation in SIPNET. {#tbl-hsp} diff --git a/docs/references.md b/docs/references.md new file mode 100644 index 0000000..4655129 --- /dev/null +++ b/docs/references.md @@ -0,0 +1,34 @@ +# References + +**EFI Standards** + +Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren, Alexey N. Shiklomanov, and Jaime Ashander. 2023. 
“A Community Convention for Ecological Forecasting: Output Files and Metadata Version 1.0.” Ecosphere 14 (11): e4686. https://doi.org/10.1002/ecs2.4686. + +**CADWR LandIQ Crop Map** + +California Department of Water Resources. (2018). Statewide Crop Mapping—California Natural Resources Agency Open Data. Retrieved “Oct 14, 2024” from https://data.cnra.ca.gov/dataset/statewide-crop-mapping. + +**CalAdapt** + +Lyons, Andrew and R Development Core Team. 2025. “Caladaptr: Tools for the Cal-Adapt API in R.” Manual. https://ucanr-igis.github.io/caladaptr. + +**SoilGrids250m** + +Hengl, T. et al. 2017. “SoilGrids250m: Global Gridded Soil Information Based on Machine Learning.” PLoS ONE 12(2): e0169748. https://doi.org/10.1371/journal.pone.0169748 + +**ERA5 Climate Data** + +Hersbach, H. et al. 2020. “The ERA5 Global Reanalysis.” Quarterly Journal of the Royal Meteorological Society 146: 1999–2049. https://doi.org/10.1002/qj.3803 + +**CalAdapt Climate Zones** +CalAdapt. 2024. “Climate Zones.” Accessed October 14, 2024. https://cal-adapt.org/tools/climate-zones/. + +**SIPNET** + +Braswell, Bobby H., William J. Sacks, Ernst Linder, and David S. Schimel. 2005. “Estimating Diurnal to Annual Ecosystem Parameters by Synthesis of a Carbon Flux Model with Eddy Covariance Net Ecosystem Exchange Observations.” Global Change Biology 11 (2): 335–55. https://doi.org/10.1111/j.1365-2486.2005.00897.x. + +Sacks, William J., David S. Schimel, Russell K. Monson, and Bobby H. Braswell. 2006. “Model‐data Synthesis of Diurnal and Seasonal CO2 Fluxes at Niwot Ridge, Colorado.” Global Change Biology 12 (2): 240–59. https://doi.org/10.1111/j.1365-2486.2005.01059.x. + +**Random Forest** + +Liaw, Andy, and Matthew Wiener. 2002. “Classification and Regression by randomForest.” R News 2 (3): 18–22. https://CRAN.R-project.org/doc/Rnews/. 
\ No newline at end of file diff --git a/docs/workflow_documentation.md b/docs/workflow_documentation.md index 0be98d0..b35976e 100644 --- a/docs/workflow_documentation.md +++ b/docs/workflow_documentation.md @@ -1,23 +1,20 @@ --- -title: "Downscaling Workflow Documentation" +title: "Technical Documentation" author: "David LeBauer" date: "`r Sys.Date()`" format: html: - self-contained: true - embed-resources: true + self-contained: false toc: true execute: echo: false --- -# Workflow Overview - The downscaling workflow predicts carbon pools (Soil Organic Carbon and Aboveground Biomass) for cropland fields in California and then aggregates these predictions to the county scale. It uses an ensemble-based approach to uncertainty propagation and analysis, maintaining ensemble structure to propagate errors through the prediction and aggregation processes. -![Spatial downscaling workflow using machine learning with environmental covariates](figures/spatial_downscaling_workflow.png){width="5in"} +![Spatial downscaling workflow using machine learning with environmental covariates](../figures/spatial_downscaling_workflow.webp){width="5in"} ## Terminology @@ -35,11 +32,15 @@ The workflows are 2. Ensemble in ccmmf/workflows repository, generates ensemble outputs 3. **Downscaling**: uses ensemble outputs to make predictions for each field in CA then aggregate to county level summaries. + + ## Workflow Steps ### Configuration -Workflow settings are configured in `000-config.R`. +Workflow settings are configured in `000-config.R`. The configuration script reads the CCMMF directory from the environment variable `CCMMF_DIR` (set in .Renviron), and uses it to define paths for inputs and outputs. @@ -65,7 +66,6 @@ git clone git@github.com:ccmmf/downscaling These are in a subdirectory of the CCMMF directory in order to make them available across all users (and because on some computers, they exceed allocated space in the home directory). 
- `R_LIBS_USER` must point to the platform and R version specific subdirectory inside `RENV_PATHS_LIBRARY`. - Example: `/projectnb/dietzelab/ccmmf/renv-library/linux-almalinux-8.10/R-4.4/x86_64-pc-linux-gnu` - `.Rprofile` - sets repositories from which R packages are installed - runs `renv/activate.R` @@ -75,16 +75,6 @@ git clone git@github.com:ccmmf/downscaling - detect and use resources for parallel processing (with future package); default is `available cores - 1` - PRODUCTION mode setting. For testing, set `PRODUCTION` to `FALSE`. This is _much_ faster and requires fewer computing resources because it subsets large datasets. Once a test run is successful, set `PRODUCTION` to `TRUE` to run the full workflow. -**Others:** - -_these shouldn't need to be changed unless you want to change the default behavior of the workflow_ - -- `renv.lock` is used for package management with `renv`. - - See [project renv setup docs](docs/renv_setup.md) for instructions about using `renv` for these workflows. - - See [renv package documentation](https://rstudio.github.io/renv/articles/renv.html) for more details. - -# - ### 1. Data Preparation ```sh @@ -210,6 +200,29 @@ PEcAn standard units are SI, following the Climate Forecasting standards: ### 4. Extract SIPNET Output + +First, uncompress the model output. Only the netCDF files are needed. 
+ +```sh +# Set paths +ccmmf_dir=/projectnb2/dietzelab/ccmmf +archive_file="$ccmmf_dir/lebauer_agu_2025_20251210.tgz" +output_dir="$ccmmf_dir/modelout/ccmmf_phase_3_scenarios_20251210" + +# Ensure output directory exists +mkdir -p "$output_dir" + +# +tar --use-compress-program="pigz -d" -xf \ + "$archive_file" \ + -C "$output_dir" \ + --strip-components=1 \ + --no-same-owner #\ +# --wildcards \ +# 'lebauer_agu_2025/output_/out/' \ +# 'lebauer_agu_2025/output_/out/ENS--/[0-9][0-9][0-9][0-9].nc' +``` + ```sh Rscript scripts/030_extract_sipnet_output.R ``` @@ -226,14 +239,33 @@ Extracts and formats SIPNET outputs for downscaling: **Outputs:** - `out/ensemble_output.csv`: Long format data -### 5. Downscale and Aggregate SIPNET Output +### 5. Mixed Cropping Systems ```sh -Rscript scripts/040_downscale_and_aggregate.R -Rscript scripts/041_downscale_analysis.R +Rscript scripts/031_aggregate_sipnet_output.R ``` -Builds Random Forest models to predict carbon pools for all fields: +Simulates mixed-cropping scenarios by combining outputs across two PFTs using `combine_mixed_crops()` (see Mixed System Prototype). Two methods are supported: + +- weighted: area-partitioned mix where `woody_cover + annual_cover = 1` +- incremental: preserve woody baseline (`woody_cover = 1`) and add the annual delta scaled by `annual_cover` + +`combine_mixed_crops()` is pool-agnostic: pass any additive quantity expressed per unit area, including instantaneous stocks +(`kg/m^2`) or flux totals that have already been accumulated over the SIPNET output interval (e.g., hourly or annual `kg/m^2` of NEE) — +that is, stocks or total flux over a defined time step. + +Outputs include `multi_pft_ensemble_output.csv`, `combined_ensemble_output.csv`, and `ensemble_output_with_mixed.csv` with a synthetic mixed PFT. + +### 6. 
Downscale, Aggregate to County, and Plot + +```sh +Rscript scripts/040_downscale.R +Rscript scripts/041_aggregate_to_county.R +Rscript scripts/042_downscale_analysis.R +Rscript scripts/043_county_level_plots.R +``` + +Builds Random Forest models to predict carbon pools for all fields; aggregates to county-level; summarizes variable importance; and produces maps: - Train models on SIPNET ensemble runs at design points - Use environmental covariates to downscale predictions to all fields @@ -241,38 +273,60 @@ Builds Random Forest models to predict carbon pools for all fields: - Output maps and statistics of carbon density and totals **Inputs:** -- `out/ensemble_output.csv`: SIPNET outputs +- `model_outdir/ensemble_output.csv`: SIPNET outputs extracted in step 4 - `data/site_covariates.csv`: Environmental covariates -**Outputs:** -- `out/county_total_AGB.png`: County-level AGB predictions -- `out/county_total_TotSoilCarb.png`: County-level SOC predictions -- `out/county_summaries.csv`: County statistics +**Outputs from `040_downscale.R`:** +- `model_outdir/downscaled_preds.csv`: Per-site, per-ensemble predictions with totals and densities +- `model_outdir/downscaled_preds_metadata.json`: Metadata for predictions +- `model_outdir/training_sites.csv`: Training site list per PFT and pool +- `cache/models/*_models.rds`: Saved RF models per spec +- `cache/training_data/*_training.csv`: Training covariate matrices per spec +- `model_outdir/downscaled_deltas.csv`: Optional start→end deltas when available -## Technical Reference +**Outputs from `041_aggregate_to_county.R`:** +- `model_outdir/county_summaries.csv`: County statistics (means/SDs across ensembles for stocks and densities) -### Ensemble Structure +**Outputs from `042_downscale_analysis.R` (saved in `figures/`):** +- `__importance_partial_plots.png`: Variable importance with partial plots for top predictors +- `__ALE_predictor.svg` and `__ICE_predictor.svg`: ALE and ICE plots -Each ensemble member represents a 
plausible realization given parameter and meteorological uncertainty. This ensemble structure is maintained throughout the workflow to properly propagate uncertainty. For example, downscaling is done for each ensemble member separately, and then the results are aggregated to county-level statistics. +**Outputs from `043_county_level_plots.R` (saved in `figures/`):** +- `county___carbon_stock.webp` and `county___carbon_density.webp`: County-level maps +- `county_diff_woody_plus_annual_minus_woody__carbon_{density,stock}.webp`: Scenario difference maps +- `county_delta___carbon_{density,stock}.webp`: Start→end delta maps when available +## Running on BU Cluster +Interactive session (example): -# References - -**EFI Standards** +```sh +qrsh -l h_rt=3:00:00 -pe omp 16 -l buyin +``` -Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren, Alexey N. Shiklomanov, and Jaime Ashander. 2023. "A Community Convention for Ecological Forecasting: Output Files and Metadata Version 1.0." Ecosphere 14 (11): e4686. https://doi.org/10.1002/ecs2.4686. +Submit a batch job (example using downscaling step): -**Data Sources** +```sh +qsub \ + -l h_rt=4:00:00 \ + -pe omp 8 \ + -o logs/040.out \ + -e logs/040.err \ + -b y Rscript scripts/040_downscale.R +``` -Land IQ, LLC. California Crop Mapping (2014). California Department of Water Resources, 2017. https://data.cnra.ca.gov/dataset/statewide-crop-mapping. +## Documentation Site (Quarto) -Hengl, T. et al. 2017. "SoilGrids250m: Global Gridded Soil Information Based on Machine Learning." PLoS ONE 12(2): e0169748. https://doi.org/10.1371/journal.pone.0169748 +- Preview: `quarto preview` +- Build: `quarto render` +- Publish to GitHub Pages: `quarto publish gh-pages` (publishes `_site/`) -Hersbach, H. et al. 2020. "The ERA5 Global Reanalysis." Quarterly Journal of the Royal Meteorological Society 146: 1999–2049. 
https://doi.org/10.1002/qj.3803 +Notes: +- Quarto config is in `_quarto.yml` and the home page is `index.qmd` (includes `README.md`). +- Uses `freeze: auto` to cache outputs; rebuild where data paths exist (see `000-config.R`). -**Models** +## Technical Reference -Braswell, Bobby H., William J. Sacks, Ernst Linder, and David S. Schimel. 2005. "Estimating Diurnal to Annual Ecosystem Parameters by Synthesis of a Carbon Flux Model with Eddy Covariance Net Ecosystem Exchange Observations." Global Change Biology 11 (2): 335–55. https://doi.org/10.1111/j.1365-2486.2005.00897.x. +### Ensemble Structure -Liaw, Andy, and Matthew Wiener. 2002. "Classification and Regression by randomForest." R News 2 (3): 18–22. https://CRAN.R-project.org/doc/Rnews/. +Each ensemble member represents a plausible realization given parameter and meteorological uncertainty. This ensemble structure is maintained throughout the workflow to properly propagate uncertainty. For example, downscaling is done for each ensemble member separately, and then the results are aggregated to county-level statistics. 
diff --git a/docs/figures/spatial_downscaling_workflow.png b/figures/spatial_downscaling_workflow.png similarity index 100% rename from docs/figures/spatial_downscaling_workflow.png rename to figures/spatial_downscaling_workflow.png diff --git a/figures/spatial_downscaling_workflow.webp b/figures/spatial_downscaling_workflow.webp new file mode 100644 index 0000000..2fac4a2 Binary files /dev/null and b/figures/spatial_downscaling_workflow.webp differ diff --git a/index.qmd b/index.qmd new file mode 100644 index 0000000..437ca5c --- /dev/null +++ b/index.qmd @@ -0,0 +1,2 @@ + +{{< include README.md >}} diff --git a/reports/design_points_analysis.qmd b/reports/design_points_analysis.qmd new file mode 100644 index 0000000..cbc0a63 --- /dev/null +++ b/reports/design_points_analysis.qmd @@ -0,0 +1,55 @@ +--- +title: "Design Points Analysis" +format: + html: + self-contained: false + df-print: paged + toc: true +--- + +Design points are representative locations selected to capture the range of environmental conditions across California croplands from CADWR (2018). +These are the locations where SIPNET is run to produce outputs later used in downscaling to all ~600k crop fields. + +We used clustering to select design points that represent the range of environmental conditions across California croplands. +The following figures illustrate the distribution of these design points, which were clustered based on environmental covariates. 
+ +The environmental covariates used for clustering are: + +| Variable | Description | Source | Units | +|----------|-----------------------------|---------------|-----------| +| temp | Mean annual temperature | ERA5 | °C | +| precip | Mean annual precipitation | ERA5 | mm/year | +| srad | Solar radiation | ERA5 | W/m² | +| vapr | Vapor pressure deficit | ERA5 | kPa | +| clay | Clay content | SoilGrids | % | +| ocd | Organic carbon density | SoilGrids | g/kg | +| twi | Topographic wetness index | SRTM-derived | – | + + +## Map of Selected Design Points + +Here we check the geographic distribution of design points relative to the distribution of all croplands. +Grey areas are the fields in the CADWR (2018) land use dataset. +Boundaries are Climate Zones from CalAdapt (Lyons, 2025). + +The design points should be well distributed across California croplands and Climate Zones. + +![Clustered Design Points](../figures/design_points.webp) + +## Relationships Among Environmental Covariates + +This pairs plot shows the relationships between covariates used for clustering, with colors indicating cluster membership. + +The clusters should show distinct groupings based on the environmental covariates. + +![Clustered Pairs Plot](../figures/cluster_pairs.webp) + +## Environmental Characteristics of each Cluster + +These plots present the normalized mean values of environmental covariates for each cluster. + +This summary highlights the characteristics of each cluster based on the environmental covariates used for clustering. + +Here we expect to see clusters with distinct environmental signatures, reflecting the unique multivariate profiles of each cluster. 
+ +![Cluster Summary](../figures/cluster_plot.svg) diff --git a/reports/downscaling_results.qmd b/reports/downscaling_results.qmd index f3d5031..a85112f 100644 --- a/reports/downscaling_results.qmd +++ b/reports/downscaling_results.qmd @@ -1,12 +1,11 @@ --- title: "Downscaling Results and Analysis" author: "David LeBauer" -date: "`r Sys.Date()`" +date: today format: html: - self-contained: true - embed-resources: true - df-print: paged + self-contained: false + df-print: default toc: true execute: echo: false @@ -16,185 +15,304 @@ execute: # Overview -This document presents the results of downscaling workflow: soil carbon stocks and aboveground biomass in California cropland fields, aggregated to the county level. +Here we present results of aggregating from field scale pool sizes to county-level estimates of county-level carbon stocks and densities for soil carbon stocks to 30cm and aboveground biomass. -The Results section presents the county-level carbon stocks and densities for soil carbon and aboveground biomass. The Analysis section explores results from the downscaling and aggregation analysis steps. +**Not validated, for illustrative purposes only.** -For detailed information about methods and workflow, please see the Workflow Documentation in [`docs/workflow_documentation.md`](https://github.com/ccmmf/downscaling/blob/update_site_selection/docs/workflow_documentation.md). +For detailed information about methods and workflow, see the [Workflow Documentation](../docs/workflow_documentation.md). -# Results - -## County-Level Carbon Stocks and Densities - -The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. - -### Soil Carbon (TotSoilCarb) +Estimates for woody and mixed woody + annual systems are done for all fields in the LandIQ dataset with tree and vinyard crops; estimates for annual systems are done for all fields with herbaceous annual crops. 
-#### Carbon Stock by County +In the mixed scenario, ground cover is planted in all orchards. The methods for simulating the addition of ground cover to orchards are described in [Mixed System Prototype](../docs/mixed_system_prototype.qmd). Briefly, it combines downscaled model outputs from the woody perennial crop and annual crop PFTs to represent the effect of ground cover on carbon pools. -![](../figures/county_TotSoilCarb_carbon_stock.png) - -#### Carbon Density by County +# Results -![](../figures/county_TotSoilCarb_carbon_density.png) +## County-Level Carbon Stocks and Densities -### Aboveground Biomass (AGB) +The maps below are organized to compare across PFTs. First we show total county carbon (Stocks, Tg), then area‑normalized density (Density, Mg/ha), each split by carbon pool and PFT. -#### Carbon Stock by County +### Stocks -![](../figures/county_AGB_carbon_stock.png) +#### Soil Carbon (TotSoilCarb) -#### Carbon Density by County +##### Annual Crop -![](../figures/county_AGB_carbon_density.png) +![](../figures/county_annual_crop_TotSoilCarb_carbon_stock.webp) +##### Woody Perennial Crop (Orchards & Vineyards) -### Table of Carbon Stocks and Density by County +![](../figures/county_woody_perennial_crop_TotSoilCarb_carbon_stock.webp) -The table below provides a searchable summary of the county-level carbon stocks and densities. 
+##### Woody + Annual (Orchards & Vineyards with Ground Cover) -```{r} -source(here::here("000-config.R")) +![](../figures/county_woody_annual_TotSoilCarb_carbon_stock.webp) -# Load county summaries data -county_summaries <- readr::read_csv(file.path(model_outdir, "county_summaries.csv"), - show_col_types = FALSE) -#colnames(county_summaries) -# Combine mean and SD into a single column for carbon density -county_summaries_table <- county_summaries |> - dplyr::mutate( - `Mean Total C (Tg/county)` = paste0( - signif(mean_total_c_Tg, 2), - " (", signif(sd_total_c_Tg, 2), ")" - ), - `Mean C Density (Mg/ha)` = paste0( - signif(mean_c_density_Mg_ha, 2), - " (", signif(sd_c_density_Mg_ha, 2), ")" - ) - ) |> - dplyr::rename( - `Carbon Pool` = model_output, - `County` = county, - `# Fields` = n - ) |> - dplyr::select(`Carbon Pool`, `County`, `# Fields`, `Mean Total C (Tg/county)`, `Mean C Density (Mg/ha)`) +#### Aboveground Biomass (AGB) -# Create Table -# TODO -# - Fix point w/ missing county +##### Annual Crop -htmlwidgets::setWidgetIdSeed(123) # required to embed table self-contained in html -options(htmlwidgets.TEMP_DIR = "htmlwidgets") +![](../figures/county_annual_crop_AGB_carbon_stock.webp) -DT::datatable( - county_summaries_table, - options = list( - pageLength = 10, - searchHighlight = TRUE - ), - rownames = FALSE, - escape = FALSE -) -``` +##### Woody Perennial Crop (Orchards & Vineyards) -# Analysis +![](../figures/county_woody_perennial_crop_AGB_carbon_stock.webp) -## Design Point Distribution +##### Woody + Annual (Orchards & Vineyards with Ground Cover) -Design points are representative locations selected to capture the range of environmental conditions across California croplands from CADWR (2018). -These are the locations where SIPNET is run to produce outputs later used in downscaling to all ~600k crop fields. 
+![](../figures/county_woody_annual_AGB_carbon_stock.webp) -We used clustering to select design points that represent the range of environmental conditions across California croplands. -The following figures illustrate the distribution of these design points, which were clustered based on environmental covariates. +### Density -The environmental covariates used for clustering are: +#### Soil Carbon (TotSoilCarb) +##### Annual Crop -| Variable | Description | Source | Units | -|----------|-------------|--------|-------| -| temp | Mean annual temperature | ERA5 | °C | -| precip | Mean annual precipitation | ERA5 | mm/year | -| srad | Solar radiation | ERA5 | W/m² | -| vapr | Vapor pressure deficit | ERA5 | kPa | -| clay | Clay content | SoilGrids | % | -| ocd | Organic carbon density | SoilGrids | g/kg | -| twi | Topographic wetness index | SRTM-derived | - | +![](../figures/county_annual_crop_TotSoilCarb_carbon_density.webp) +##### Woody Perennial Crop (Orchards & Vineyards) -### Map of Selected Design Points +![](../figures/county_woody_perennial_crop_TotSoilCarb_carbon_density.webp) -Here we check the geographic distribution of design points relative to the distribution of all croplands. -Grey areas are the fields in the CADWR (2018) land use dataset. -Boundaries are Climate Zones from CalAdapt (Lyons, 2025). +##### Woody + Annual (Orchards & Vineyards with Ground Cover) +![](../figures/county_woody_annual_TotSoilCarb_carbon_density.webp) -The design points should be well distributed across California croplands and Climate Zones. +#### Aboveground Biomass (AGB) -![Clustered Design Points](../figures/design_points.png) +##### Annual Crop -### Relationships Among Environmental Covariates +![](../figures/county_annual_crop_AGB_carbon_density.webp) -This pairs plot shows the relationships between covariates used for clustering, with colors indicating cluster membership. 
+##### Woody Perennial Crop (Orchards & Vineyards) -The clusters should show distinct groupings based on the environmental covariates. +![](../figures/county_woody_perennial_crop_AGB_carbon_density.webp) -![Clustered Pairs Plot](../figures/cluster_pairs.png) +##### Woody + Annual (Orchards & Vineyards with Ground Cover) -### Environmental Characteristics of each Cluster +![](../figures/county_woody_annual_AGB_carbon_density.webp) -These plots present This the normalized mean values of environmental covariates for each cluster. +### Tables: County Stocks and Density (by PFT) and PFT Comparison -This summary highlights the characteristics of each cluster based on the environmental covariates used for clustering. +The first table summarizes county-level metrics for each PFT. The second table pivots carbon pools to highlight differences across cropping systems. -Here we expect to see clusters with distinct environmental signatures, reflecting the unique multivariate profiles of each cluster. +```{r, options, include=FALSE} +options(ccmmf.quiet_banner = TRUE) +source(here::here("000-config.R")) +``` -![Cluster Summary](../figures/cluster_plot.png) +```{r,load_inputs, message=FALSE, warning=FALSE} -## Variable Importance and Partial Dependence +# Load county summaries data -The following plots show the variable importance from the random forest models used for downscaling, along with partial dependence plots for the top two predictors. 
+# Long table with PFT + area +county_summaries_table <- county_summaries |> + dplyr::mutate( + `Mean Total C (Tg/county)` = paste0(signif(mean_total_c_Tg, 2), " (", signif(sd_total_c_Tg, 2), ")"), + `Mean C Density (Mg/ha)` = paste0(signif(mean_c_density_Mg_ha, 2), " (", signif(sd_c_density_Mg_ha, 2), ")"), + `Area (ha)` = signif(mean_total_ha, 3) + ) |> + dplyr::rename( + `Carbon Pool` = model_output, + `PFT` = pft, + `County` = county, + `# Fields` = n + ) |> + dplyr::select(`Carbon Pool`, `PFT`, `County`, `# Fields`, `Area (ha)`, `Mean Total C (Tg/county)`, `Mean C Density (Mg/ha)`) -Variable importance quantifies how useful each covariate is in predicting the carbon stock. Partial dependence plots show the marginal effect of individual predictors on model response after averaging over the other predictors. +htmlwidgets::setWidgetIdSeed(123) -### Total Soil Carbon +DT::datatable( + county_summaries_table, + extensions = c('Scroller','SearchPanes','Select'), + options = list( + dom = 'Plrtip', + pageLength = 10, + searchHighlight = FALSE, + deferRender = TRUE, + scroller = TRUE, + scrollY = "60vh", + searchPanes = list(cascadePanes = TRUE, initCollapsed = TRUE), + columnDefs = list(list(searchPanes = list(show = TRUE), targets = c(0,1))) + ), + class = "stripe hover compact", + rownames = FALSE, escape = FALSE +) -![](../figures/TotSoilCarb_importance_partial_plots.png) +# Wide comparison: TotSoilCarb density by PFT +density_wide <- county_summaries |> + dplyr::select(County = county, `Carbon Pool` = model_output, PFT = pft, `C Density (Mg/ha)` = mean_c_density_Mg_ha) |> + tidyr::pivot_wider(names_from = PFT, values_from = `C Density (Mg/ha)`) |> + dplyr::arrange(`Carbon Pool`, County) + +# Drop rows where the row-wise max value is < MASK_THRESHOLD * colsum of the column where that max occurs, +# evaluated within each Carbon Pool separately +if (exists("MASK_THRESHOLD")) { + mask_group <- function(df) { + vals <- as.matrix(dplyr::select(df, -County, -`Carbon 
Pool`)) + vals2 <- vals; vals2[is.na(vals2)] <- -Inf + col_sums <- colSums(vals, na.rm = TRUE) + if (all(!is.finite(col_sums)) || sum(col_sums, na.rm = TRUE) <= 0) return(df) + row_max_val <- apply(vals2, 1, max) + row_max_col <- max.col(vals2, ties.method = "first") + thresholds <- col_sums[row_max_col] * get0("MASK_THRESHOLD", ifnotfound = 0.01) + keep <- is.finite(row_max_val) & row_max_val >= thresholds + df[keep, , drop = FALSE] + } + density_wide <- density_wide |> + dplyr::group_split(`Carbon Pool`, .keep = TRUE) |> + purrr::map(mask_group) |> + dplyr::bind_rows() +} -### Aboveground Biomass (AGB) Carbon +DT::datatable( + density_wide, + extensions = c('Scroller','SearchPanes','Select'), + options = list( + dom = 'Plrtip', + pageLength = 10, + searchHighlight = FALSE, + deferRender = TRUE, + scroller = TRUE, + scrollY = "60vh", + searchPanes = list(cascadePanes = TRUE, initCollapsed = TRUE), + columnDefs = list(list(searchPanes = list(show = TRUE), targets = c(0,1))) + ), + class = "stripe hover compact", + rownames = FALSE, escape = FALSE +) +``` -![](../figures/AGB_importance_partial_plots.png) + \ No newline at end of file diff --git a/reports/variable_importance.qmd b/reports/variable_importance.qmd new file mode 100644 index 0000000..a31fdb4 --- /dev/null +++ b/reports/variable_importance.qmd @@ -0,0 +1,32 @@ +--- +title: "Downscaling Model Evaluation" +format: + html: + self-contained: false + toc: true +execute: + freeze: false +--- + +# Variable importance and Partial Dependence Plots (PDP) + +This page reports variable-importance and partial dependence plots for the Random Forest models used to downscale AGB and SOC. + +In the first plot, x axis is the importance of each variable, measured as the increase in mean squared error (MSE) when that variable is permuted (i.e., its values are randomly shuffled). +Higher values indicate that the variable is more important for accurate predictions. 
+ +The next two plots show the partial dependence of the response variable (AGB or SOC) on the top two most important predictors, holding all other predictors at their average values (i.e., the marginal effect of each predictor on the response variable). Here, the y axis is normalized. + +### AGB + +Here we see that vapor pressure deficit (vapr) is the most important variable; the second most important is precipitation (precip). Temperature (temp) and solar radiation (srad) are also important. + +PDPs show a negative trend: increasing vapr leads to lower AGB, while increasing precip leads to higher AGB up to a point, after which there is a decline before levelling off. + +![](../figures/agb_importance.png){width=100%} + +### SOC (TotSoilCarb) + +Here we see that vapor pressure deficit is again the most important variable, followed by temperature. PDPs show negative trends - increasing vapr or temp leads to lower SOC. + +![](../figures/soc_importance.png){width=100%} diff --git a/scripts/009_update_landiq.R b/scripts/009_update_landiq.R index 1f19a51..bb585c8 100644 --- a/scripts/009_update_landiq.R +++ b/scripts/009_update_landiq.R @@ -1,6 +1,6 @@ -# This script is used to explore LandIQ data -# and reconcile design points generated in previous iterations -# with updated LandIQ data. +# This script is used to explore LandIQ data +# and reconcile design points generated in previous iterations +# with updated LandIQ data. 
# Will be obsolete once we re-generate new design points in phase 3 ## One off code used to fix LandIQ data has been moved to a gist:.groups @@ -18,11 +18,11 @@ cluster_library(cl, "dplyr") ## Load Configuration ## later can be replaced w/ config.yml or pecan.xml -config_file <- here::here("000-config.R") +config_file <- here::here("000-config.R") if (file.exists(config_file)) { source(config_file) } else { - PEcAn.logger::logger.error( + PEcAn.logger::logger.severe( "Config file not found, are you in the correct directory?", getwd() ) } @@ -33,8 +33,8 @@ crops_all <- data.table::fread( filter(!is.na(CLASS)) |> mutate( SUBCLASS = replace_na(SUBCLASS, 0) - ) - + ) + crop_pft_map <- readr::read_csv( file.path(raw_data_dir, "cadwr_land_use", "CARB_PFTs_table.csv") ) |> @@ -56,7 +56,7 @@ crops_all |> values_from = n, values_fill = 0 ) - + # Perform join to map PFTs crops_all_pft <- crops_all |> left_join( @@ -65,7 +65,7 @@ crops_all_pft <- crops_all |> "CLASS" = "crop_type", "SUBCLASS" = "crop_code" ) - ) + ) # Split into multidplyr_df by year and season # For parallel dplyr crops_all_pft_x <- crops_all_pft |> @@ -75,9 +75,11 @@ crops_all_pft_x <- crops_all_pft |> # 1) How many rows lost pft_group (i.e. join failures) by year/season? 
crops_all_pft_x |> - summarise(n_total = n(), - n_missing = sum(is.na(pft_group)), - pct_missing = round(n_missing / n_total * 100)) |> + summarise( + n_total = n(), + n_missing = sum(is.na(pft_group)), + pct_missing = round(n_missing / n_total * 100) + ) |> collect() |> print(n = Inf) @@ -150,12 +152,12 @@ woody_herb_summary <- crops_all_pft_x |> woody_pcnt = sum(PCNT[pft_group == "woody"], na.rm = TRUE), herb_pcnt = sum(PCNT[pft_group == "herbaceous"], na.rm = TRUE) ) |> - ungroup() |> - filter(n_pft == 2) - + ungroup() |> + filter(n_pft == 2) + z <- woody_herb_summary |> collect() |> - #filter(woody_pcnt > 0, herb_pcnt > 0) |> + # filter(woody_pcnt > 0, herb_pcnt > 0) |> mutate( total_pcnt = woody_pcnt + herb_pcnt ) @@ -186,7 +188,7 @@ woody_herb_fields_by_year_season |> summarize(n_fields = n_distinct(UniqueID)) |> arrange(year, season) -### Trying reconcile 2016 and 2018 LandIQ data +### Trying reconcile 2016 and 2018 LandIQ data crops_all_2016 <- crops_all |> filter(year == 2016) @@ -196,7 +198,7 @@ crops_all_2018 <- crops_all |> dwr_2018 <- terra::vect( file.path(raw_data_dir, "cadwr_land_use", "LandIQ_shapefiles", "i15_Crop_Mapping_2018_SHP", "i15_Crop_Mapping_2018.shp") -) |> +) |> terra::project("epsg:3310") dwr_2016 <- terra::vect( @@ -210,11 +212,11 @@ dwr_x <- terra::intersect(dwr_2018, dwr_2016) ## Goal is to reconcile dwr_2016 Unique_ID with dwr_2018 UniqueID ## join them by geometry, creating id_2016 and id_2018 ## find where UniqueID has changed (or not) -## where 2016 is missing from 2018, find nearest polygon and calculate distance +## where 2016 is missing from 2018, find nearest polygon and calculate distance dwr_merged <- terra::intersect(dwr_2018, dwr_2016) |> # Project to WGS84 (decimal degrees) before converting to dataframe terra::project("epsg:4326") |> - as.data.frame(geom = "xy") |> + as.data.frame(geom = "xy") |> select(contains("id"), x, y) |> rename(lon = x, lat = y) |> mutate( @@ -229,9 +231,10 @@ unmatched <- 
anti_join(design_points, design_points_ids, by = "site_id") |> # Then append to the design_points_ids if (nrow(unmatched) > 0) { unmatched_vect <- terra::vect( - unmatched, geom = c("lon", "lat"), crs = "epsg:4326" + unmatched, + geom = c("lon", "lat"), crs = "epsg:4326" ) |> terra::project("epsg:3310") - + # Calculate distance matrix and find nearest polygons nearest_data <- tibble( site_id = unmatched$site_id, @@ -260,19 +263,22 @@ if (nrow(unmatched) > 0) { PEcAn.logger::logger.info("Design points joined to nearest polygon:") nearest_data |> - knitr::kable(digits = 5) - + knitr::kable(digits = 5) + # Combine intersected points with nearest points design_points_ids_updated <- bind_rows(design_points_ids, nearest_data) } - - -write.csv(design_points_ids_updated |> - select(UniqueID, site_id, lat, lon) |> - left_join(design_points |> - select(site_id, pft), - by = "site_id") |> - arrange(pft, lat, lon), + + +write.csv( + design_points_ids_updated |> + select(UniqueID, site_id, lat, lon) |> + left_join( + design_points |> + select(site_id, pft), + by = "site_id" + ) |> + arrange(pft, lat, lon), here::here("data/design_points.csv"), row.names = FALSE ) @@ -300,7 +306,7 @@ crops_all |> arrange(desc(n_counties)) |> print(n = 15) -### +### grass_fields <- crops_all |> filter(CLASS == "P", MULTIUSE == "S", season == 2) |> @@ -315,7 +321,7 @@ grass_fields <- crops_all |> ids = paste(unique(UniqueID), collapse = ",") ) |> ungroup() |> - sample_n(10) |> + slice_sample(n = 10) |> rename(lat = centy, lon = centx) |> terra::vect(crs = "epsg:3857") |> terra::project("epsg:4269") |> diff --git a/scripts/011_prepare_anchor_sites.R b/scripts/011_prepare_anchor_sites.R index 2929117..602aeec 100644 --- a/scripts/011_prepare_anchor_sites.R +++ b/scripts/011_prepare_anchor_sites.R @@ -23,7 +23,7 @@ p <- anchor_sites_pts |> scale_color_brewer(palette = "Dark2") + labs(color = "PFT") + theme_minimal() -ggsave(p, filename = "figures/anchor_sites.png", dpi = 300, bg = "white") 
+ggsave_optimized("figures/anchor_sites.webp", plot = p, dpi = 96, bg = "white") #' Match anchor sites to LandIQ fields #' diff --git a/scripts/021_clustering_diagnostics.R b/scripts/021_clustering_diagnostics.R index 120a9eb..a234507 100644 --- a/scripts/021_clustering_diagnostics.R +++ b/scripts/021_clustering_diagnostics.R @@ -7,7 +7,14 @@ ca_climregions <- sf::st_read(file.path(data_dir, "ca_climregions.gpkg")) ######### Cluster Diagnostics ################ -sites_clustered <- readRDS(file.path(cache_dir, "sites_clustered.rds")) +sites_clustered_path <- file.path(cache_dir, "sites_clustered.rds") +if (!file.exists(sites_clustered_path)) { + PEcAn.logger::logger.severe("Expected clustering output not found:", sites_clustered_path) +} +sites_clustered <- readRDS(sites_clustered_path) +if (!("cluster" %in% names(sites_clustered))) { + PEcAn.logger::logger.severe("Clustering object lacks 'cluster' column; check upstream clustering step.") +} # Summarize clusters cluster_summary <- sites_clustered |> dplyr::group_by(cluster) |> @@ -16,19 +23,26 @@ cluster_summary <- sites_clustered |> knitr::kable(cluster_summary, digits = 0) # Plot all pairwise numeric variables -ggpairs_plot <- sites_clustered |> - dplyr::select(-site_id) |> - # need small # pfts for ggpairs - dplyr::sample_n(min(nrow(sites_clustered), 1000)) |> - GGally::ggpairs( - # plot all values except site_id and cluster - columns = setdiff(names(sites_clustered), c("site_id", "cluster")), - mapping = aes(color = as.factor(cluster), alpha = 0.8) - ) + - theme_minimal() -ggsave(ggpairs_plot, - filename = "figures/cluster_pairs.png", - dpi = 300, width = 10, height = 10, units = "in" + +withr::with_seed(42, { + ggpairs_plot <- sites_clustered |> + dplyr::select(-site_id) |> + # need small # pfts for ggpairs + dplyr::slice_sample( + n = min(nrow(sites_clustered), 10000) + ) |> + GGally::ggpairs( + # plot all values except site_id and cluster + columns = setdiff(names(sites_clustered), c("site_id", "cluster")), 
+ mapping = aes(color = as.factor(cluster), alpha = 0.8) + ) + + theme_minimal() +}) + +ggsave_optimized( + "figures/cluster_pairs.webp", + plot = ggpairs_plot, + width = 10, height = 10, units = "in", dpi = 96 ) # scale and reshape to long for plotting @@ -52,7 +66,46 @@ cluster_plot <- ggplot( labs(x = "Variable", y = "Normalized Value") + theme_minimal() -ggsave(cluster_plot, filename = "figures/cluster_plot.png", dpi = 300, bg = "white") +ggsave_optimized("figures/cluster_plot.svg", plot = cluster_plot) + +#' ### Which covariates define the clusters? (Unsupervised VI) +#' +#' We estimate each variable's contribution to cluster separation using the +#' proportion of variance explained by clusters (eta-squared, ): +#' = between-cluster variance / total variance. Higher values indicate +#' variables whose means differ more strongly across clusters. + +# Compute eta-squared () per numeric variable +num_vars <- sites_clustered |> + dplyr::select(where(is.numeric)) |> + names() +num_vars <- setdiff(num_vars, c("cluster")) + +eta2_tbl <- purrr::map_dfr(num_vars, function(vn) { + x <- sites_clustered[[vn]] + cl <- as.factor(sites_clustered$cluster) + m <- mean(x, na.rm = TRUE) + # counts and means by cluster (handle NAs per group) + g_mean <- tapply(x, cl, function(v) mean(v, na.rm = TRUE)) + g_n <- tapply(x, cl, function(v) sum(!is.na(v))) + N <- sum(!is.na(x)) + total <- stats::var(x, na.rm = TRUE) * max(N - 1, 1) + between <- sum(g_n * (g_mean - m)^2, na.rm = TRUE) + eta2 <- ifelse(total > 0, between / total, NA_real_) + tibble::tibble(variable = vn, eta2 = eta2) +}) |> + dplyr::arrange(dplyr::desc(eta2)) + +vi_cluster_plot <- ggplot(eta2_tbl, aes(x = reorder(variable, eta2), y = eta2)) + + geom_col(fill = "steelblue") + + coord_flip() + + labs( + x = "Predictor", y = expression(eta^2 ~ " (between / total variance)"), + title = "K-means cluster separation by predictor ()" + ) + + theme_minimal() + +ggsave_optimized("figures/cluster_variable_importance.svg", plot = 
vi_cluster_plot) #' #' #### Stratification by Crops and Climate Regions @@ -117,9 +170,6 @@ design_points_clust <- design_points |> dplyr::mutate(cluster = as.factor(cluster)) |> sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) -ca_fields_pts <- ca_fields |> - sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) - design_pt_plot <- ggplot() + geom_sf(data = ca_climregions, fill = "white") + theme_minimal() + @@ -130,4 +180,4 @@ design_pt_plot <- ggplot() + size = 2, stat = "sf_coordinates" ) -ggsave(design_pt_plot, filename = "figures/design_points.png", dpi = 300, bg = "white") +ggsave_optimized("figures/design_points.webp", plot = design_pt_plot, width = 10, height = 6, units = "in", dpi = 96, bg = "white") diff --git a/scripts/030_extract_sipnet_output.R b/scripts/030_extract_sipnet_output.R index 5b818ca..6891300 100644 --- a/scripts/030_extract_sipnet_output.R +++ b/scripts/030_extract_sipnet_output.R @@ -1,5 +1,5 @@ # This file processess the output from SIPNET ensemble runs and generates -# A long format CSV (time, site, ensemble, variable) +# A long format CSV (time, site, ensemble, variable) # that follows the Ecological Forecasting Initiative (EFI) forecast standard # Helper functions in R/efi_long_to_arrays.R will convert this to @@ -7,11 +7,6 @@ # 2. 
A NetCDF file (time, site, ensemble, variable) # TODO: write out EML metadata in order to be fully EFI compliant - -## First, uncompress the model output -# tar --use-compress-program="pigz -d" -xf ccmmf_phase_2a_DRAFT_output_20250516.tgz -# tar --use-compress-program="pigz -d" -xf ccmmf_phase_2a_DRAFT_output_20250516.tgz --wildcards '*.nc' - ## Second, make sure ccmmf_dir and pecan_outdir are defined in the config file source("000-config.R") PEcAn.logger::logger.info("***Starting SIPNET output extraction***") @@ -80,7 +75,7 @@ site_ids <- site_meta |> ens_ids <- 1:ensemble_size variables <- outputs_to_extract # TODO standardize this name; variables is ambiguous - # but is used by the PEcAn read.output function +# but is used by the PEcAn read.output function if (!PRODUCTION) { ## -----TESTING SUBSET----## @@ -133,7 +128,7 @@ ens_results_raw <- furrr::future_pmap_dfr( # .Bug report: https://github.com/r-quantities/units/issues/409 # Fixed in units version >= 0.8.7 .options = furrr::furrr_options(seed = TRUE) -) +) ens_results <- ens_results_raw |> dplyr::group_by(parameter, base_site_id, pft, year) |> @@ -156,12 +151,12 @@ ens_results <- ens_results_raw |> std_vars <- PEcAn.utils::standard_vars pool_vars <- std_vars |> - dplyr::filter(stringr::str_detect(tolower(Category), "pool")) |> - dplyr::pull(Variable.Name) + dplyr::filter(stringr::str_detect(tolower(Category), "pool")) |> + dplyr::pull(Variable.Name) flux_vars <- std_vars |> - dplyr::filter(stringr::str_detect(tolower(Category), "flux")) |> - dplyr::pull(Variable.Name) + dplyr::filter(stringr::str_detect(tolower(Category), "flux")) |> + dplyr::pull(Variable.Name) ens_results <- ens_results |> dplyr::mutate( @@ -191,7 +186,7 @@ if (any(ens_results$variable_type == "flux")) { # restore logging logger_level <- PEcAn.logger::logger.setLevel(logger_level) - + ensemble_output_csv <- file.path(model_outdir, "ensemble_output.csv") readr::write_csv(ens_results, ensemble_output_csv) PEcAn.logger::logger.info( diff 
--git a/scripts/031_aggregate_sipnet_output.R b/scripts/031_aggregate_sipnet_output.R index 16554e2..215b4aa 100644 --- a/scripts/031_aggregate_sipnet_output.R +++ b/scripts/031_aggregate_sipnet_output.R @@ -1,5 +1,5 @@ ## Simulate multi-PFT scenarios by aggregating SIPNET output from two PFTs -## Two approaches: +## Two approaches: ## - overlap (e.g. orchard + herbaceous ground cover; geometric overlap) ## - discrete (e.g. annual crop monoculture with hedgerows; partitions area) ## @@ -7,7 +7,7 @@ ## ensemble_output.csv (long) ## site_cover_fractions.csv with columns: ## site_id, year, woody_cover, annual_cover, scenario (scenario ∈ {overlap, discrete}) -## For development, a mock grid of (woody_cover, annual_cover, scenario) is generated and +## For development, a mock grid of (woody_cover, annual_cover, scenario) is generated and ## applied to all sites) ## ## Notation: @@ -17,11 +17,10 @@ source("000-config.R") PEcAn.logger::logger.info("*** Starting multi-PFT aggregation ***") -source(here::here("R", "mixed_aggregation.R")) # ---- Load ensemble output ---------------------------------------------------- ensemble_output_csv <- file.path(model_outdir, "ensemble_output.csv") -ens_output <- readr::read_csv(ensemble_output_csv) |> +ensemble_data <- readr::read_csv(ensemble_output_csv) |> # rename EFI std names for clarity # efi name | new name # parameter | ensemble_id @@ -38,9 +37,9 @@ ens_output <- readr::read_csv(ensemble_output_csv) |> # cover_fractions_csv <- file.path(model_outdir, "site_cover_fractions.csv") # distinct site-year combinations -distinct_site_year <- ens_output |> +distinct_site_year <- ensemble_data |> dplyr::mutate(year = lubridate::year(datetime)) |> - dplyr::distinct(site_id, year) + dplyr::distinct(site_id, year) # Scenarios for development # - 100% woody perennial (orchard / monoculture) @@ -51,7 +50,7 @@ distinct_site_year <- ens_output |> # - Annual crop + 50% woody hedgerows scenarios <- tibble::tibble( annual_cover = c(0, 1, 0.25, 
0.5, 0.75, 0.5), - woody_cover = c(1, 0, 1, 1, 0.25, 0.5), + woody_cover = c(1, 0, 1, 1, 0.25, 0.5), mix_description = c( "100% woody", "100% annual", "100% woody + 25% annual", "100% woody + 50% annual", @@ -68,31 +67,30 @@ scenarios <- tibble::tibble( # pfts woody_pft <- "woody perennial crop" annual_pft <- "annual crop" -mixed_overlap_pft <- "woody_annual_overlap_100_50" # new synthetic PFT label +mixed_overlap_pft <- "woody_annual_overlap_100_50" # new synthetic PFT label # ensemble members -ensemble_ids <- unique(ens_output$ensemble_id) +ensemble_ids <- unique(ensemble_data$ensemble_id) # annual_init values for each site x ensemble: value at the earliest datetime -annual_init <- ens_output |> +annual_init <- ensemble_data |> dplyr::filter(pft == "annual crop") |> dplyr::group_by(site_id, variable, ensemble_id) |> dplyr::slice_min(order_by = datetime, n = 1, with_ties = FALSE) |> dplyr::mutate( annual_init = dplyr::case_when( - # TODO: think through this - # we want to add AGB to the ecosystem level value - # for SOC, we only want the diff - # this probably isn't the best place to store this logic - # also, + # For incremental (overlap) mixing, use a baseline for annuals: + # - AGB: baseline ~ 0 at planting; carry forward only the increment. + # - TotSoilCarb (SOC): baseline is the initial SOC stock (value at first timestep). 
+ # baseline SOC is used to compute delta SOC increment variable == "AGB" ~ 0, variable == "TotSoilCarb" ~ value - ) - ) |> + ) + ) |> dplyr::select(site_id, ensemble_id, variable, annual_init) # ---- Reshape ensemble output (wide by PFT) ----------------------------------- -.ens_wide <- ens_output |> +.ens_wide <- ensemble_data |> dplyr::mutate(year = lubridate::year(datetime)) |> dplyr::select( datetime, year, site_id, lat, lon, @@ -101,7 +99,7 @@ annual_init <- ens_output |> tidyr::pivot_wider(names_from = pft, values_from = value) scenarios_x_vars <- scenarios |> - tidyr::crossing(ensemble_id = ensemble_ids) + tidyr::crossing(ensemble_id = ensemble_ids) ens_wide <- .ens_wide |> dplyr::rename( @@ -136,33 +134,49 @@ if (n_bad > 0) { # ---- Check for missing values --------------------------------------- # TBD -if(any(is.na(ens_wide))) { +if (any(is.na(ens_wide))) { PEcAn.logger::logger.severe( "Missing values found in ensemble wide data. Examples:\n" ) head(ens_wide[is.na(ens_wide)], 10) } -# ---- Combine values (row-wise) ---------------------------------------------- +# ---- Combine values (vectorized by method) ---------------------------------- -ens_combined <- ens_wide |> - dplyr::rowwise() |> +# Weighted case (discrete area partition) +ens_weighted <- ens_wide |> + dplyr::filter(method == "weighted") |> dplyr::mutate( - value_combined = combine_value( + value_combined = combine_mixed_crops( woody_value = woody_value, annual_value = annual_value, - annual_init = ifelse(method == "incremental", annual_init, NULL), annual_cover = annual_cover, woody_cover = woody_cover, - method = method + method = "weighted" ) - ) + ) + +# Incremental case (overlap; preserve woody baseline, add annual increment) +ens_incremental <- ens_wide |> + dplyr::filter(method == "incremental") |> + dplyr::mutate( + value_combined = combine_mixed_crops( + woody_value = woody_value, + annual_value = annual_value, + annual_cover = annual_cover, + woody_cover = woody_cover, + annual_init = 
annual_init, + method = "incremental" + ) + ) + +ens_combined <- dplyr::bind_rows(ens_weighted, ens_incremental) ################################################################################ # ---- Write outputs ----------------------------------------------------------- -# +# ens_wide_csv <- file.path(model_outdir, "multi_pft_ensemble_output.csv") readr::write_csv(ens_wide, ens_wide_csv) PEcAn.logger::logger.info("Wrote wide diagnostic: ", ens_wide_csv) @@ -173,13 +187,18 @@ PEcAn.logger::logger.info("Wrote aggregated output: ", ens_combined_csv) # ---- Create EFI-compliant ensemble file with mixed overlap PFT ---- # Original EFI-style rows (restore EFI column names) -efi_original <- ens_output |> +efi_original <- ensemble_data |> dplyr::rename(parameter = ensemble_id, prediction = value) # Extract the specific overlap scenario: 100% woody, 50% annual (orchard + 50% ground cover) mixed_overlap_rows <- ens_combined |> dplyr::ungroup() |> dplyr::filter( + # Use a single overlap scenario for EFI export: + # orchard with 50% ground cover + # this can be configurable if indicated during calibration / validation. + # This parameter can be calibrated based on observed data, + # and can vary, or not, by location mix_description == "100% woody + 50% annual" ) |> dplyr::select( diff --git a/scripts/040_downscale.R b/scripts/040_downscale.R new file mode 100644 index 0000000..cce3ac9 --- /dev/null +++ b/scripts/040_downscale.R @@ -0,0 +1,898 @@ +# This workflow does the following: +# +# - Use environmental covariates to predict SIPNET estimated SOC for each field in the LandIQ dataset +# - Uses Random Forest [may change to CNN later] trained on site-scale model runs. +# - Build a model for each ensemble member +# - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance. 
+# - Aggregates County-level biomass and SOC inventories +# +## ----debugging-------------------------------------------------------------------- +# debugonce(PEcAnAssimSequential::ensemble_downscale) +# PEcAn.logger::logger.setQuitOnSevere(TRUE) +# ----setup-------------------------------------------------------------------- + + +source("000-config.R") +PEcAn.logger::logger.info("***Starting Downscaling and Aggregation***") + +#----- load ensemble data ---------------------------------------------------- +ensemble_csv <- file.path(model_outdir, "ensemble_output.csv") +timer_read_ensemble <- step_timer() +ensemble_data <- readr::read_csv(ensemble_csv) |> + dplyr::rename( + ensemble = parameter # parameter is EFI std name for ensemble + ) +PEcAn.logger::logger.info( + "Loaded ensemble data:", nrow(ensemble_data), "rows;", + dplyr::n_distinct(ensemble_data$site_id), "unique site_ids;", + dplyr::n_distinct(ensemble_data$ensemble), "ensembles;", + dplyr::n_distinct(ensemble_data$pft), "PFTs;", + dplyr::n_distinct(ensemble_data$variable), "variables (carbon pools) in file; load_time_s=", + round(step_elapsed(timer_read_ensemble), 2) +) +log_mem("After loading ensemble data :: ") + +ensemble_ids <- ensemble_data |> + dplyr::pull(ensemble) |> + unique() + +start_date <- lubridate::as_date(min(ensemble_data$datetime)) +end_date <- lubridate::as_date(max(ensemble_data$datetime)) + +#--- load ca_fields ------------------------------------------------ +# this is a convenience time saver for development +# cache sf object to avoid repeated reads in interactive sessions. +# TODO: consider memoise::memoise() for production robustness or +# refactor to pass as function argument. 
+if (!exists("ca_fields_full")) { + ca_fields_full <- sf::read_sf(file.path(data_dir, "ca_fields.gpkg")) +} + +ca_fields <- ca_fields_full |> + sf::st_drop_geometry() |> + dplyr::select(site_id, county, area_ha) + +# Normalize reference table to one row per site_id and warn if duplicates exist +dup_counts <- ca_fields |> + dplyr::count(site_id, name = "n") |> + dplyr::filter(n > 1) +if (nrow(dup_counts) > 0) { + PEcAn.logger::logger.warn( + "ca_fields has duplicate site_id rows: ", nrow(dup_counts), + "; collapsing to first observed county/area per site_id. Examples: ", + paste(utils::head(dup_counts$site_id, 5), collapse = ", ") + ) +} +ca_fields <- ca_fields |> + dplyr::group_by(site_id) |> + dplyr::summarise( + county = dplyr::first(county), + area_ha = dplyr::first(area_ha), + .groups = "drop" + ) + +ca_field_attributes <- readr::read_csv(file.path(data_dir, "ca_field_attributes.csv")) + +# Determine PFTs and map ensemble keys (e.g. 'woody') to field labels +ensemble_pfts <- sort(unique(ensemble_data$pft)) +field_pfts <- sort(unique(ca_field_attributes$pft)) + +# map each ensemble key to itself (each key acts as its own label) +pfts <- intersect(ensemble_pfts, field_pfts) + +if (length(pfts) == 0) { + PEcAn.logger::logger.error("No overlapping PFTs between ensemble data and field attributes") +} else { + PEcAn.logger::logger.info("Downscaling will be performed for these PFTs:", paste(pfts, collapse = ", ")) +} + +#--- load site covariates +covariates_csv <- file.path(data_dir, "site_covariates.csv") +timer_read_cov <- step_timer() +covariates <- readr::read_csv(covariates_csv) |> + dplyr::select( + site_id, where(is.numeric), + -climregion_id + ) +PEcAn.logger::logger.info( + "Loaded covariates:", nrow(covariates), "sites x", ncol(covariates) - 1, "numeric predictors; load_time_s=", + round(step_elapsed(timer_read_cov), 2) +) +log_mem("After loading covariates :: ") + +covariate_names <- names(covariates |> + dplyr::select(where(is.numeric))) + 
+PEcAn.logger::logger.info( + "Downscaling will use these covariates:\n\n", + paste(covariate_names, collapse = ", ") +) + +# ---- variable-importance helpers ------------------------------------------- +safe_sanitize <- function(x) { + gsub("[^A-Za-z0-9]+", "_", x) +} + +extract_vi <- function(model) { + # Supports randomForest and ranger models + if (inherits(model, "randomForest")) { + vi <- tryCatch(randomForest::importance(model), error = function(e) NULL) + if (is.null(vi)) { + return(NULL) + } + if ("%IncMSE" %in% colnames(vi)) as.numeric(vi[, "%IncMSE"]) else as.numeric(vi[, 1]) + } else if (inherits(model, "ranger")) { + vi <- tryCatch(model$variable.importance, error = function(e) NULL) + if (is.null(vi)) { + return(NULL) + } + as.numeric(vi) + } else { + NULL + } +} + +extract_vi_names <- function(model) { + if (inherits(model, "randomForest")) { + vi <- tryCatch(randomForest::importance(model), error = function(e) NULL) + if (is.null(vi)) { + return(NULL) + } + rownames(vi) + } else if (inherits(model, "ranger")) { + nms <- tryCatch(names(model$variable.importance), error = function(e) NULL) + nms + } else { + NULL + } +} + +extract_oob_r2 <- function(model, y_train = NULL) { + if (inherits(model, "randomForest")) { + if (!is.null(model$predicted) && !is.null(model$y)) { + y <- model$y + yhat <- model$predicted + if (length(y) == length(yhat) && stats::var(y) > 0) { + return(1 - sum((y - yhat)^2) / sum((y - mean(y))^2)) + } + } + return(NA_real_) + } + if (inherits(model, "ranger")) { + mse_oob <- tryCatch(model$prediction.error, error = function(e) NA_real_) + if (!is.na(mse_oob)) { + if (is.null(y_train)) { + return(NA_real_) + } + v <- stats::var(y_train) + if (!is.finite(v) || v <= 0) { + return(NA_real_) + } + return(1 - mse_oob / v) + } + } + NA_real_ +} + +# ----define design points based on ensemble data------------------------------- +# TODO: move this sanitization upstream to when ens data is created (030_extract_sipnet_output.R) +# or better 
... figure out why we so often run into mis-match!!! +# at least this time the missing site_ids all had matches within a few micrometers (10^-6 m) + +# Check for missing design points in covariates and match by proximity if needed +# required_dp_cols <- c("site_id", "lat", "lon", "pft") +# .design_points <- ensemble_data |> +# dplyr::select(dplyr::any_of(required_dp_cols)) |> +# dplyr::distinct() + +# design_points <- update_design_point_site_ids( +# .design_points, +# ca_field_attributes +# ) + +# TODO: Need to put a canonical design_points CSV in repository +### FOR NOW, just use hard coded design points +design_points <- structure(list(site_id = c( + "3a84c0268e1655a3", "3a84c0268e1655a3", + "d523652b399a8f6e", "d523652b399a8f6e", "275102c035b15f5e", "275102c035b15f5e", + "26ff9e8246f7c8f4", "26ff9e8246f7c8f4", "47cd11223bb49112", "47cd11223bb49112", + "9a4c7e47fc0297bb", "9a4c7e47fc0297bb", "e5bb4dca46bd5041", "e5bb4dca46bd5041", + "abd5a71d492e92e1", "abd5a71d492e92e1", "7fe5bb855fb36cdb", "7fe5bb855fb36cdb", + "7bb77bae6ac3c147", "7bb77bae6ac3c147" +), lat = c( + 34.91295, 34.91295, + 34.38596, 34.38596, 34.47244, 34.47244, 33.86884, 33.86884, 34.29708, + 34.29708, 33.96727, 33.96727, 33.35306, 33.35306, 34.37258, 34.37258, + 33.90119, 33.90119, 33.57847, 33.57847 +), lon = c( + -120.40345, + -120.40345, -118.81446, -118.81446, -119.22015, -119.22015, -117.40838, + -117.40838, -119.06014, -119.06014, -117.34049, -117.34049, -117.19182, + -117.19182, -119.03318, -119.03318, -117.40624, -117.40624, -116.03157, + -116.03157 +), pft = c( + "woody perennial crop", "annual crop", "woody perennial crop", "annual crop", "woody perennial crop", + "annual crop", "woody perennial crop", "annual crop", "woody perennial crop", "annual crop", "woody perennial crop", "annual crop", + "woody perennial crop", "annual crop", "woody perennial crop", "annual crop", "woody perennial crop", "annual crop", "woody perennial crop", + "annual crop" +)), row.names = c(NA, -20L), class 
= c( + "tbl_df", "tbl", + "data.frame" +)) + +stopifnot(all(design_points$site_id %in% covariates$site_id)) + +# should we ever get here? +if (!all(design_points$site_id %in% ensemble_data$site_id)) { + ensemble_data2 <- ensemble_data |> + dplyr::left_join(design_points, by = c("lat", "lon", "pft"), suffix = c("", ".dp")) |> + dplyr::mutate(site_id = site_id.dp) |> + dplyr::select(-site_id.dp) + n_missing <- setdiff(design_points$site_id, ensemble_data2$site_id) |> + length() + + if (n_missing > 0) { + PEcAn.logger::logger.error( + n_missing, "design points still missing from ensemble data after matching", + "this is already a hack, time to sort it out upstream!" + ) + } + ensemble_data <- ensemble_data2 +} +stopifnot(any(design_points$site_id %in% ensemble_data$site_id)) + + +# Scaled numeric design covariates for model diagnostics/plots +# Keep an unscaled copy of design covariates for prediction inputs +design_covariates_unscaled <- design_points |> + dplyr::left_join(covariates, by = "site_id") |> + dplyr::select(site_id, dplyr::all_of(covariate_names)) |> + as.data.frame() + +# Scaled numeric design covariates for model diagnostics/plots +design_covariates <- design_covariates_unscaled |> + # randomForest pkg requires data frame + as.data.frame() |> + # scale covariates as for consistency with model + dplyr::mutate(dplyr::across(dplyr::all_of(covariate_names), scale)) + +# Check again to ensure we've resolved the issue +n_not_in_covariates_after <- setdiff(design_points$site_id, covariates$site_id) |> + length() +if (n_not_in_covariates_after > 0) { + PEcAn.logger::logger.error(n_not_in_covariates_after, "design points still missing covariate data after matching") +} + +all(design_points$site_id %in% covariates$site_id) + +# Keep full covariates and perform per-PFT sampling later (dev mode) +covariates_full <- covariates +if (!PRODUCTION) { + if (!exists(".Random.seed")) set.seed(123) + PEcAn.logger::logger.info("Development mode: will sample up to 10k 
prediction sites per PFT") + # keep ca_field_attributes consistent with available covariates + ca_field_attributes <- ca_field_attributes |> + dplyr::filter(site_id %in% covariates_full$site_id) +} + +# Build list of site_ids per PFT from field attributes +pft_site_ids <- ca_field_attributes |> + dplyr::filter(pft %in% pfts) |> + dplyr::distinct(site_id, pft) + +sites_info <- pft_site_ids |> + dplyr::group_by(pft) |> + dplyr::summarise(n_sites = dplyr::n(), .groups = "drop") +PEcAn.logger::logger.info( + "Sites per PFT:", + paste(paste0(sites_info$pft, "=", sites_info$n_sites), collapse = "; ") +) +log_mem("After computing sites per PFT :: ") + +#### Target sites: per-PFT site lists built above (pft_site_ids) + +## Wrapper to downscale a single carbon pool with explicit training set and target sites +# TODO refactor to to downscale_ensemble_output() +downscale_model_output <- function(date, + model_output, + train_ensemble_data, + train_site_coords = design_points, + pred_covariates = covariates) { + # Ensure training site coords only include sites present in the ensemble slice + # Restrict training coordinates to those present in the ensemble data + ens_sites <- unique(train_ensemble_data$site_id) + train_site_coords <- train_site_coords[train_site_coords$site_id %in% ens_sites, , drop = FALSE] + + if (nrow(train_site_coords) == 0) { + PEcAn.logger::logger.warn("No overlapping training sites between site_coords and ensemble data for pool", model_output) + return(NULL) + } + + filtered_ens_data <- PEcAnAssimSequential::subset_ensemble( + ensemble_data = train_ensemble_data, + site_coords = train_site_coords, + date = date, + carbon_pool = model_output + ) + if (is.null(filtered_ens_data) || nrow(filtered_ens_data) == 0) { + PEcAn.logger::logger.warn("Filtered ensemble data is empty for pool", model_output) + return(NULL) + } + + ## BEGIN HACK FOR SMALL-N + # n_unique_sites <- dplyr::n_distinct(train_site_coords$site_id) + # if (n_unique_sites <= 12) { + # 
source(here::here("R", "temporary_hack_fit_predict_small_n.R")) + # PEcAn.logger::logger.info("Small-N mode: training per-ensemble RF with nodesize=1 and no test split") + # downscale_output <- fit_predict_small_n( + # filtered_ens = filtered_ens_data, + # pred_covariates = pred_covariates, + # covariate_names = covariate_names, # already defined earlier in 040 + # nodesize = 1, + # ntree = 1000 + # ) + # } else { + ## END HACK FOR SMALL-N + # Downscale the data + downscale_output <- + PEcAnAssimSequential::ensemble_downscale( + ensemble_data = filtered_ens_data, + site_coords = train_site_coords, + covariates = pred_covariates + ) + # } ## REMOVE WITH HACK FOR SMALL-N + if (is.null(downscale_output)) { + return(NULL) + } + + # Attach the site_ids used for prediction to keep mapping explicit + downscale_output$site_ids <- pred_covariates$site_id + return(downscale_output) +} + +# not using furrr b/c it is used inside downscale +# We downscale each carbon pool for both woody and annual PFTs, +# predicting to the same target set for that PFT +downscale_output_list <- list() +delta_output_records <- list() +training_sites_records <- list() +combo_total <- length(outputs_to_extract) * length(pfts) +combo_index <- 0L +loop_global_timer <- step_timer() +for (pool in outputs_to_extract) { + for (pft_i in pfts) { + combo_index <- combo_index + 1L + iter_timer <- step_timer() + PEcAn.logger::logger.info( + sprintf( + "[Progress %d/%d] Starting downscaling for %s (%s) at %s", + combo_index, combo_total, pool, pft_i, ts_now() + ) + ) + + # train_ens: ensemble data filtered by PFT + train_ens <- ensemble_data |> + dplyr::filter(pft == pft_i & variable == pool) + + # Skip empty slices early + if (nrow(train_ens) == 0) { + PEcAn.logger::logger.warn("No ensemble rows for ", pft_i, "::", pool, " skipping") + next + } + + # Determine per-slice end date and warn if ensembles disagree + slice_end_date <- as.Date(max(train_ens$datetime)) + end_by_ens <- train_ens |> + 
dplyr::group_by(ensemble) |> + dplyr::summarise(last_date = max(lubridate::as_date(datetime)), .groups = "drop") + if (dplyr::n_distinct(end_by_ens$last_date) > 1) { + PEcAn.logger::logger.warn( + "End dates vary across ensembles for ", pft_i, "::", pool, + "; using slice_end_date=", as.character(slice_end_date) + ) + } else { + PEcAn.logger::logger.info( + "Using slice_end_date=", as.character(slice_end_date), " for ", pft_i, "::", pool + ) + } + # train_pts: design points filtered by PFT + train_pts <- train_ens |> + dplyr::select(site_id, lat, lon, pft) |> + dplyr::distinct() + + # Diagnostic: overlapping site counts + n_train_ens_sites <- length(unique(train_ens$site_id)) + n_train_pts <- nrow(train_pts) + PEcAn.logger::logger.info("Training sites: ensemble has", n_train_ens_sites, "site_ids; using", n_train_pts, "coords") + training_sites_records[[paste0(pft_i, "::", pool)]] <- + tibble::tibble(site_id = unique(train_pts$site_id), pft = pft_i, model_output = pool) + + # Ensure design point covariates are included for training join + dp_pft <- design_covariates_unscaled |> + dplyr::filter(site_id %in% (design_points |> + dplyr::filter(pft == pft_i) |> + dplyr::pull(site_id))) + # prediction covariates: either full set for that PFT (prod) or sampled subset (dev) + if (PRODUCTION) { + pred_covs <- covariates_full |> + dplyr::filter(site_id %in% (pft_site_ids |> dplyr::filter(pft == pft_i) |> dplyr::pull(site_id))) |> + dplyr::bind_rows(dp_pft) |> + dplyr::distinct(site_id, .keep_all = TRUE) + } else { + sample_pool <- covariates_full |> + dplyr::filter(site_id %in% pft_site_ids[[pft_i]]) |> + dplyr::anti_join(dp_pft, by = "site_id") + + n_sample <- min(10000, nrow(sample_pool)) + sampled <- if (n_sample > 0) { + sample_pool |> + dplyr::slice_sample(n = n_sample) + } else { + sample_pool + } + + pred_covs <- dplyr::bind_rows(sampled, dp_pft) + } + + # Guard for empty prediction covariates (both development mode and production) + if (nrow(pred_covs) == 0) { + 
PEcAn.logger::logger.warn("No prediction covariates for PFT:", pft_i, " pool:", pool, " skipping") + next + } + + # Sanity: ensure all training site_ids exist in prediction covariates + missing_train_cov <- setdiff(unique(train_pts$site_id), pred_covs$site_id) + if (length(missing_train_cov) > 0) { + PEcAn.logger::logger.error( + "Missing covariates for training site_ids (", length(missing_train_cov), ") in PFT ", pft_i, + ": ", paste(utils::head(missing_train_cov, 10), collapse = ", "), + if (length(missing_train_cov) > 10) " ..." else "" + ) + } + + call_timer <- step_timer() + result <- downscale_model_output( + date = slice_end_date, + model_output = pool, + train_ensemble_data = train_ens, + train_site_coords = train_pts, + pred_covariates = pred_covs + ) + PEcAn.logger::logger.info( + "Completed downscaling for", pool, "(", pft_i, ") in", + round(step_elapsed(call_timer), 2), "s; n_pred_sites=", nrow(pred_covs), + " n_train_sites=", n_train_pts + ) + log_mem(paste0("After downscaling ", pool, " (", pft_i, ") :: ")) + # Also compute start-date predictions to enable delta maps + start_obj <- downscale_model_output( + date = start_date, + model_output = pool, + train_ensemble_data = train_ens, + train_site_coords = train_pts, + pred_covariates = pred_covs + ) + + # Save models and VI metrics if available from downscale outputs + if (!is.null(result) && !is.null(result$model)) { + models_dir <- file.path(cache_dir, "models") + train_dir <- file.path(cache_dir, "training_data") + if (!dir.exists(models_dir)) dir.create(models_dir, recursive = TRUE, showWarnings = FALSE) + if (!dir.exists(train_dir)) dir.create(train_dir, recursive = TRUE, showWarnings = FALSE) + + spec_key <- paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool)) + saveRDS(result$model, file = file.path(models_dir, paste0(spec_key, "_models.rds"))) + + # Write the explicit training covariate matrix for the sites used to fit the model + tr_covs <- dp_pft |> + 
dplyr::semi_join(train_pts, by = "site_id") |> + dplyr::select(site_id, dplyr::all_of(covariate_names)) + tr_path <- file.path(train_dir, paste0(spec_key, "_training.csv")) + readr::write_csv(tr_covs, tr_path) + + ens_labels <- names(result$predictions) + if (is.null(ens_labels)) ens_labels <- as.character(seq_along(result$predictions)) + vi_rows <- list() + for (mi in seq_along(result$model)) { + mdl <- result$model[[mi]] + vi_vals <- extract_vi(mdl) + vi_nms <- extract_vi_names(mdl) + if (is.null(vi_vals) || is.null(vi_nms)) next + y_train <- NULL + if (!is.null(result$data) && !is.null(result$data$training)) { + tr <- result$data$training + if ("ensemble" %in% names(tr) && "prediction" %in% names(tr)) { + y_train <- tr$prediction[tr$ensemble == ens_labels[mi]] + } + } else if (!is.null(mdl$y)) { + y_train <- mdl$y + } + # TODO OOB r2 is within-training only (bootstrap from same sites) + # Add spatial cross-validation (e.g., leave-one-site-out or spatial + # blocking). + r2_oob <- extract_oob_r2(mdl, y_train) + vi_rows[[length(vi_rows) + 1L]] <- tibble::tibble( + pft = pft_i, + model_output = pool, + ensemble = ens_labels[mi], + predictor = vi_nms, + importance = as.numeric(vi_vals[seq_along(vi_nms)]), + oob_r2 = r2_oob + ) + } + if (length(vi_rows) > 0) { + vi_tbl <- dplyr::bind_rows(vi_rows) + out_vi_per_ens <- file.path(model_outdir, paste0("vi_", spec_key, "_by_ensemble.csv")) + readr::write_csv(vi_tbl, out_vi_per_ens) + } + } + + # store using pft::pool names (e.g. "woody::AGB"). 
+ downscale_output_list[[paste0(pft_i, "::", pool)]] <- result + if (!is.null(result) && !is.null(start_obj)) { + end_df <- purrr::map( + result$predictions, + ~ tibble::tibble(site_id = result$site_ids, prediction = .x) + ) |> + dplyr::bind_rows(.id = "ensemble") |> + dplyr::rename(end_pred = prediction) + start_df <- purrr::map( + start_obj$predictions, + ~ tibble::tibble(site_id = start_obj$site_ids, prediction = .x) + ) |> + dplyr::bind_rows(.id = "ensemble") |> + dplyr::rename(start_pred = prediction) + delta_df <- end_df |> + dplyr::inner_join(start_df, by = c("site_id", "ensemble")) |> + dplyr::mutate(delta_pred = end_pred - start_pred) |> + dplyr::left_join(ca_fields, by = "site_id") |> + dplyr::mutate( + pft = pft_i, + model_output = pool, + delta_c_density_Mg_ha = PEcAn.utils::ud_convert(delta_pred, "kg/m2", "Mg/ha"), + delta_total_c_Mg = delta_c_density_Mg_ha * area_ha + ) |> + dplyr::select(site_id, pft, ensemble, delta_c_density_Mg_ha, delta_total_c_Mg, area_ha, county, model_output) + delta_output_records[[paste0(pft_i, "::", pool)]] <- delta_df + } + + # Incremental checkpoint (so production runs can be resumed) + if (!is.null(result)) { + tryCatch( + { + saveRDS(downscale_output_list, file = file.path(cache_dir, "downscale_partial.rds")) + }, + error = function(e) { + PEcAn.logger::logger.warn("Failed to write partial checkpoint: ", conditionMessage(e)) + } + ) + } + PEcAn.logger::logger.info( + sprintf( + "[Progress %d/%d] Finished %s (%s); iter_time_s=%.2f; elapsed_total_s=%.2f", + combo_index, combo_total, pool, pft_i, + step_elapsed(iter_timer), step_elapsed(loop_global_timer) + ) + ) + } +} + +if (length(downscale_output_list) == 0) { + PEcAn.logger::logger.severe("No downscale outputs produced") +} +PEcAn.logger::logger.info( + "Downscaling loop complete; total_elapsed_s=", + round(step_elapsed(loop_global_timer), 2) +) +log_mem("Post primary downscaling loop :: ") + +PEcAn.logger::logger.info( + "Finished downscaling.\nCongratulations! 
You are almost there.\n" +) + +### --- Print Metrics for Each Ensemble Member ---#### + +PEcAn.logger::logger.info("Downscaling model results for each ensemble member:") +metrics_timer <- step_timer() +metrics <- lapply(downscale_output_list, PEcAnAssimSequential::downscale_metrics) + +median_metrics <- purrr::map(metrics, function(m) { + m |> + dplyr::select(-ensemble) |> + dplyr::summarise( # do equivalent of colmeans but for medians + dplyr::across( + .cols = dplyr::everything(), + .fns = list(median = ~ median(.x)), + .names = "{col}" + ) + ) +}) + +PEcAn.logger::logger.info("Median downscaling model metrics:") +dplyr::bind_rows(median_metrics, .id = "model_output") |> + knitr::kable() +PEcAn.logger::logger.info( + "Computed median metrics in", round(step_elapsed(metrics_timer), 2), "s" +) + +if (!PRODUCTION) { + # For testing, use a subset of fields + # could be even faster if we queried from gpkg: + # sf::read_sf(..., sql = "SELECT * FROM ca_fields WHERE site_id IN (...)") + ca_fields <- ca_fields |> + dplyr::right_join(covariates, by = "site_id") +} + +# Convert list to table with predictions and site identifier +# Helper: Convert a single downscale object to tidy predictions table +get_downscale_preds <- function(downscale_obj) { + purrr::map( + downscale_obj$predictions, + ~ tibble::tibble(site_id = downscale_obj$site_ids, prediction = .x) + ) |> + dplyr::bind_rows(.id = "ensemble") |> + dplyr::left_join(ca_fields, by = "site_id") +} + +# Assemble predictions; carry PFT label by parsing element name: "{pft}::{pool}" +downscale_preds <- purrr::map(downscale_output_list, get_downscale_preds) |> + dplyr::bind_rows(.id = "spec") |> + tidyr::separate( + col = "spec", + into = c("pft", "model_output"), + sep = "::", + remove = TRUE + ) |> + # Convert kg/m2 to Mg/ha using PEcAn.utils::ud_convert + dplyr::mutate(c_density_Mg_ha = PEcAn.utils::ud_convert(prediction, "kg/m2", "Mg/ha")) |> + # Calculate total Mg per field: c_density_Mg_ha * area_ha + 
dplyr::mutate(total_c_Mg = c_density_Mg_ha * area_ha) |> + dplyr::select(site_id, pft, ensemble, c_density_Mg_ha, total_c_Mg, area_ha, county, model_output, -prediction) + +dp <- downscale_preds |> + dplyr::select( + site_id, pft, ensemble, + c_density_Mg_ha, total_c_Mg, + area_ha, county, model_output + ) + +## --- Mixed scenario: orchard overlap with 50% grass on woody fields --- ## +# Goal: add an additional PFT record "woody + annual" computed as: +# combined = woody_value + f_annual * (annual_end - annual_start) +# where values are in kg/m2 and f_annual = 0.5. + +# Identify labels used for woody and annual PFTs +# TODO use lookup table to map crops --> pfts and get labels +woody_label <- pfts[grepl("woody", pfts, ignore.case = TRUE)] +annual_label <- pfts[grepl("annual", pfts, ignore.case = TRUE)] + +if (is.na(woody_label) | is.na(annual_label)) { + PEcAn.logger::logger.warn("Cannot build mixed scenario: missing woody or annual PFT") +} else { + PEcAn.logger::logger.info( + "Building mixed scenario 'woody + annual' using overlap (incremental) with 50% annual cover" + ) +} + +# Helper to tidy a downscale object to site_id/ensemble/prediction (kg/m2) +tidy_downscale <- function(ds) { + purrr::map( + ds$predictions, + ~ tibble::tibble(site_id = ds$site_ids, prediction = .x) + ) |> + dplyr::bind_rows(.id = "ensemble") +} + +# Determine target woody sites that exist in current dp +target_woody_sites <- dp |> + dplyr::filter(pft == woody_label) |> + dplyr::distinct(site_id) |> + dplyr::pull(site_id) + +# If no woody sites present, skip +if (length(target_woody_sites) == 0) { + PEcAn.logger::logger.warn("No woody sites found in downscaled predictions; skipping mixed scenario") +} else { + # Build covariates for predicting annual onto woody sites, ensuring + # design-point covariates for the annual PFT are available for training join + dp_annual <- design_covariates_unscaled |> + dplyr::filter(site_id %in% (design_points |> + dplyr::filter(pft == annual_label) |> + 
dplyr::pull(site_id))) + + pred_cov_mixed <- covariates_full |> + dplyr::filter(site_id %in% target_woody_sites) |> + dplyr::bind_rows(dp_annual) |> + dplyr::distinct(site_id, .keep_all = TRUE) + + mixed_records <- list() + + for (pool in outputs_to_extract) { + # Training data for annual + train_ens_annual <- ensemble_data |> + dplyr::filter(pft == annual_label & variable == pool) + + if (nrow(train_ens_annual) == 0) { + PEcAn.logger::logger.warn("No annual ensemble data for pool ", pool, "; skipping mixed for this pool") + next + } + + train_pts_annual <- train_ens_annual |> + dplyr::select(site_id, lat, lon, pft) |> + dplyr::distinct() + + # Annual predictions at start and end dates on woody sites + annual_start_obj <- downscale_model_output( + date = start_date, + model_output = pool, + train_ensemble_data = train_ens_annual, + train_site_coords = train_pts_annual, + pred_covariates = pred_cov_mixed + ) + annual_end_obj <- downscale_model_output( + date = end_date, + model_output = pool, + train_ensemble_data = train_ens_annual, + train_site_coords = train_pts_annual, + pred_covariates = pred_cov_mixed + ) + + # Get woody predictions at end date from existing results + woody_key <- paste0(woody_label, "::", pool) + woody_obj <- downscale_output_list[[woody_key]] + + if (is.null(annual_start_obj) || is.null(annual_end_obj) || is.null(woody_obj)) { + PEcAn.logger::logger.warn("Missing components for mixed scenario in pool ", pool, "; skipping") + next + } + + woody_df <- tidy_downscale(woody_obj) |> + dplyr::filter(site_id %in% target_woody_sites) |> + dplyr::rename(woody_pred = prediction) + + ann_start_df <- tidy_downscale(annual_start_obj) |> + dplyr::filter(site_id %in% target_woody_sites) |> + dplyr::rename(annual_start = prediction) + + ann_end_df <- tidy_downscale(annual_end_obj) |> + dplyr::filter(site_id %in% target_woody_sites) |> + dplyr::rename(annual_end = prediction) + + # Join by site_id and ensemble to align predictions (include annual_start for 
SOC)
+ mix_df <- woody_df |>
+ dplyr::inner_join(ann_end_df, by = c("site_id", "ensemble")) |>
+ dplyr::inner_join(ann_start_df, by = c("site_id", "ensemble"))
+
+ if (nrow(mix_df) == 0) {
+ PEcAn.logger::logger.warn("No overlapping site/ensemble rows for mixed scenario in pool ", pool)
+ next
+ }
+
+ f_annual <- 0.5 # TODO: will come from monitoring / scenario data later
+ mix_df <- mix_df |>
+ dplyr::mutate(
+ mixed_pred = combine_mixed_crops(
+ woody_value = .data$woody_pred,
+ annual_value = .data$annual_end,
+ annual_init = if (pool == "AGB") 0 else .data$annual_start,
+ annual_cover = f_annual,
+ woody_cover = 1.0,
+ method = "incremental"
+ )
+ ) |>
+ # add area/county for totals
+ dplyr::left_join(ca_fields, by = "site_id") |>
+ dplyr::mutate(
+ pft = "woody + annual",
+ model_output = pool,
+ c_density_Mg_ha = PEcAn.utils::ud_convert(mixed_pred, "kg/m2", "Mg/ha"),
+ total_c_Mg = c_density_Mg_ha * area_ha
+ ) |>
+ dplyr::select(site_id, pft, ensemble, c_density_Mg_ha, total_c_Mg, area_ha, county, model_output)
+
+ mixed_records[[pool]] <- mix_df
+
+ # Also save per-site treatment scenarios on woody fields for comparisons
+ woody_scn <- woody_df |>
+ dplyr::left_join(ca_fields, by = "site_id") |>
+ dplyr::mutate(
+ pft = woody_label,
+ model_output = pool,
+ scenario = "woody_100",
+ c_density_Mg_ha = PEcAn.utils::ud_convert(woody_pred, "kg/m2", "Mg/ha"),
+ total_c_Mg = c_density_Mg_ha * area_ha
+ ) |>
+ dplyr::select(site_id, pft, ensemble, scenario, c_density_Mg_ha, total_c_Mg, area_ha, county, model_output)
+
+ annual_scn <- ann_end_df |>
+ dplyr::left_join(ca_fields, by = "site_id") |>
+ dplyr::mutate(
+ pft = annual_label,
+ model_output = pool,
+ scenario = "annual_100",
+ c_density_Mg_ha = PEcAn.utils::ud_convert(annual_end, "kg/m2", "Mg/ha"),
+ total_c_Mg = c_density_Mg_ha * area_ha
+ ) |>
+ dplyr::select(site_id, pft, ensemble, scenario, c_density_Mg_ha, total_c_Mg, area_ha, county, model_output)
+
+ mixed_scn <- mix_df |>
+ dplyr::mutate(scenario = 
"woody_50_annual_50") |> + dplyr::select(site_id, pft, ensemble, scenario, c_density_Mg_ha, total_c_Mg, area_ha, county, model_output) + + # accumulate + if (!exists("treatment_records", inherits = FALSE)) treatment_records <- list() + treatment_records[[length(treatment_records) + 1L]] <- dplyr::bind_rows(woody_scn, annual_scn, mixed_scn) + } + + # Append mixed records if any + if (length(mixed_records) > 0) { + mixed_df_all <- dplyr::bind_rows(mixed_records, .id = "pool") |> + dplyr::select(-pool) + dp <- dplyr::bind_rows(dp, mixed_df_all) + } +} + + +## Write out downscaled predictions + +readr::write_csv( + dp, # downscale predictions with mixed scenario appended (if available) + file.path(model_outdir, "downscaled_preds.csv") +) + +# Write training site IDs used for each spec (pft x pool) +if (length(training_sites_records) > 0) { + train_sites_df <- dplyr::bind_rows(training_sites_records) |> + dplyr::distinct() + readr::write_csv(train_sites_df, file.path(model_outdir, "training_sites.csv")) + PEcAn.logger::logger.info("Training site list written to", file.path(model_outdir, "training_sites.csv")) +} +metadata <- list( + title = "Downscaled SIPNET Outputs", + description = "SIPNET model outputs downscaled to field level using Random Forest", + created = Sys.time(), + ensembles = sort(unique(as.integer(ensemble_ids))), + pfts = pfts, + outputs_to_extract = outputs_to_extract, + start_date = as.character(start_date), + end_date = as.character(end_date), + mixed_cover_fraction = 0.5, + columns = list( + site_id = "Unique identifier for each field from LandIQ", + pft = "Plant functional type", + ensemble = "Ensemble member identifier", + c_density_Mg_ha = "Predicted carbon density (Mg/ha)", + total_c_Mg = "Predicted total carbon (Mg) per field", + area_ha = "Field area in hectares", + county = "California county name where the field is located", + model_output = "Type of SIPNET model output (e.g., AGB, TotSoilCarb)" + ) +) + +metadata |> + jsonlite::write_json( 
+ file.path(model_outdir, "downscaled_preds_metadata.json"), + pretty = TRUE, auto_unbox = TRUE + ) + +if (length(delta_output_records) > 0) { + delta_dp <- dplyr::bind_rows(delta_output_records, .id = "spec") |> + tidyr::separate(col = "spec", into = c("pft", "model_output"), sep = "::", remove = TRUE) + readr::write_csv(delta_dp, file.path(model_outdir, "downscaled_deltas.csv")) + PEcAn.logger::logger.info("Delta predictions written to", file.path(model_outdir, "downscaled_deltas.csv")) +} + +# Write treatment comparisons for woody sites if available +if (exists("treatment_records") && length(treatment_records) > 0) { + treatments_df <- dplyr::bind_rows(treatment_records) + out_treat <- file.path(model_outdir, "treatments_woody_sites.csv") + readr::write_csv(treatments_df, out_treat) + PEcAn.logger::logger.info("Treatment scenarios written to", out_treat) +} + +PEcAn.logger::logger.info("Downscaled predictions written to", file.path(model_outdir, "downscaled_preds.csv")) +PEcAn.logger::logger.info( + "Total script elapsed time (s):", + round(step_elapsed(overall_timer), 2) +) +log_mem("End of script :: ") diff --git a/scripts/040_downscale_and_aggregate.R b/scripts/040_downscale_and_aggregate.R deleted file mode 100644 index 2a31459..0000000 --- a/scripts/040_downscale_and_aggregate.R +++ /dev/null @@ -1,278 +0,0 @@ -# This workflow does the following: -# -# - Use environmental covariates to predict SIPNET estimated SOC for each field in the LandIQ dataset -# - Uses Random Forest [may change to CNN later] trained on site-scale model runs. -# - Build a model for each ensemble member -# - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance. 
-# - Aggregates County-level biomass and SOC inventories -# -## ----setup-------------------------------------------------------------------- - -source("000-config.R") -PEcAn.logger::logger.info("***Starting Downscaling and Aggregation***") -# library(tidyverse) -# library(sf) -# library(terra) -# library(furrr) -library(patchwork) # for combining plots - -ensemble_file_base <- file.path(model_outdir, "ensemble_output.csv") -ensemble_file_mixed <- file.path(model_outdir, "ensemble_output_with_mixed.csv") - -if (file.exists(ensemble_file_mixed)) { - PEcAn.logger::logger.info("Using mixed ensemble file (includes overlap synthetic PFT): ", ensemble_file_mixed) - ensemble_file <- ensemble_file_mixed -} else { - PEcAn.logger::logger.info("Mixed ensemble file not found; using base ensemble file: ", ensemble_file_base) - ensemble_file <- ensemble_file_base -} - -ensemble_data <- readr::read_csv(ensemble_file) |> - dplyr::rename( - ensemble = parameter # parameter is EFI std name for ensemble - # should decide if we want to change downstream code to use parameter - # including PECAnAssimSequential::subset_ensemble - ) - -# Optional: log available PFTs -PEcAn.logger::logger.info("PFTs in ensemble input: ", paste(sort(unique(ensemble_data$pft)), collapse = ", ")) - -x <- readr::read_csv(file.path(model_outdir, "combined_ensemble_output.csv")) - - -ensemble_ids <- ensemble_data |> - dplyr::pull(ensemble) |> - unique() - -start_date <- format(as.Date(min(ensemble_data$datetime)), "%Y-%m-%d") -end_date <- format(as.Date(max(ensemble_data$datetime)), "%Y-%m-%d") - -### Random Forest using PEcAn downscale workflow -## ----------------------------------------------------------------------------- -design_points <- ensemble_data |> - dplyr::select(site_id, lat, lon, pft) |> - dplyr::distinct() - -covariates_csv <- file.path(data_dir, "site_covariates.csv") -covariates <- readr::read_csv(covariates_csv) |> - dplyr::select( - site_id, where(is.numeric), - -climregion_id - ) - 
-covariate_names <- names(covariates |> - dplyr::select(where(is.numeric))) - -design_covariates <- design_points |> - dplyr::left_join(covariates, by = "site_id") |> - dplyr::select(site_id, dplyr::all_of(covariate_names)) |> - # randomForest pkg requires data frame - as.data.frame() |> - # scale covariates as for consistency with model - dplyr::mutate(dplyr::across(dplyr::all_of(covariate_names), scale)) - - -if (length(setdiff(design_points$site_id, unique(covariates$site_id))) > 0) { - PEcAn.logger::logger.error("Design points not in covariates:", length(not_in_covariates)) -} - -# Subset data for testing / speed purposes -if (!PRODUCTION) { - if (!exists(".Random.seed")) set.seed(123) - covariates <- covariates |> - dplyr::anti_join(design_points, by = "site_id") |> - dplyr::slice_sample(n = 10000) |> - dplyr::bind_rows(design_covariates) # limit to 10k sites with reproducible sampling -} - -## TODO migrate to PEcAnAssimSequential -downscale_model_output <- function(date, model_output) { - filtered_ens_data <- PEcAnAssimSequential::subset_ensemble( - ensemble_data = ensemble_data, - site_coords = design_points, - date = date, - carbon_pool = model_output - ) - - # Downscale the data - downscale_output <- PEcAnAssimSequential::ensemble_downscale( - ensemble_data = filtered_ens_data, - site_coords = design_points, - covariates = covariates, - seed = 123 - ) - return(downscale_output) -} - -# not using furrr b/c it is used inside downscale -downscale_output_list <- purrr::map( - outputs_to_extract, - ~ { - PEcAn.logger::logger.info("Starting downscaling for", .x) - result <- downscale_model_output(date = end_date, model_output = .x) - PEcAn.logger::logger.info("Downscaling complete for", .x) - result - } -) |> - purrr::set_names(outputs_to_extract) - -PEcAn.logger::logger.info("Downscaling complete for all model outputs") - -## Save to make it easier to restart -#### ---Create checkpoint for downstream analysis---#### -checkpoint_file <- file.path(cache_dir, 
"downscaling_output.RData") -start_end <- system.time( - save(downscale_output_list, covariates, design_points, design_covariates, ensemble_ids, - file = checkpoint_file, - compress = FALSE - ) -) -PEcAn.logger::logger.info( - "Downscaling output objects saved to", checkpoint_file, - "\nIt took", round(start_end[3] / 60, 2), "minutes" -) - -PEcAn.logger::logger.info( - "🌟🌟🌟Finished downscaling🌟🌟🌟", - "\n\nCongratulations! You are almost there!\n\n", - rep("🚀", 10) -) - -###--- Print Metrics for Each Ensemble Member ---#### - -PEcAn.logger::logger.info("Downscaling model results for each ensemble member:") -metrics <- lapply(downscale_output_list, PEcAnAssimSequential::downscale_metrics) - -median_metrics <- purrr::map(metrics, function(m) { - m |> - dplyr::select(-ensemble) |> - dplyr::summarise( # do equivalent of colmeans but for medians - dplyr::across( - .cols = dplyr::everything(), - .fns = list(median = ~ median(.x)), - .names = "{col}" - ) - ) -}) - -PEcAn.logger::logger.info("Median downscaling model metrics:") -dplyr::bind_rows(median_metrics, .id = "model_output") |> - knitr::kable() - -#### ---- Aggregate to County Level ---- #### -#### TODO Split into separate script? 
- -PEcAn.logger::logger.info("Aggregating to County Level") - -ca_fields_full <- sf::read_sf(file.path(data_dir, "ca_fields.gpkg")) - -ca_fields <- ca_fields_full |> - dplyr::select(site_id, county, area_ha) - -if(!PRODUCTION) { - # For testing, use a subset of fields - # could be even faster if we queried from gpkg: - # sf::read_sf(..., sql = "SELECT * FROM ca_fields WHERE site_id IN (...)") - ca_fields <- ca_fields |> - dplyr::right_join(covariates, by = "site_id") -} - -# Convert list to table with predictions and site identifier -get_downscale_preds <- function(downscale_output_list) { - purrr::map( - downscale_output_list$predictions, - ~ tibble::tibble(site_id = covariates$site_id, prediction = .x) - ) |> - dplyr::bind_rows(.id = "ensemble") |> - dplyr::left_join(ca_fields, by = "site_id") -} - -downscale_preds <- purrr::map(downscale_output_list, get_downscale_preds) |> - dplyr::bind_rows(.id = "model_output") |> - # Convert kg/m2 to Mg/ha using PEcAn.utils::ud_convert - dplyr::mutate(c_density_Mg_ha = PEcAn.utils::ud_convert(prediction, "kg/m2", "Mg/ha")) |> - # Calculate total Mg per field: c_density_Mg_ha * area_ha - dplyr::mutate(total_c_Mg = c_density_Mg_ha * area_ha) - -### TODO Debug and catch if it appears again -na_summary <- downscale_preds |> - dplyr::summarise(dplyr::across(dplyr::everything(), ~ sum(is.na(.x)))) |> - tidyr::pivot_longer(dplyr::everything(), names_to = "column", values_to = "n_na") |> - dplyr::filter(n_na > 0) - -if (nrow(na_summary) > 0) { - ## concise log message - PEcAn.logger::logger.warn( - "YOU NEED TO DEBUG THIS!!!\n\n", - "NA values detected in `downscale_preds`:\n" - ) - knitr::kable(na_summary, format = "simple") - # remove all rows with NA values - downscale_preds <- tidyr::drop_na(downscale_preds) -} - -ens_county_preds <- downscale_preds |> - # Now aggregate to get county level totals for each pool x ensemble - dplyr::group_by(model_output, county, ensemble) |> - dplyr::summarize( - n = dplyr::n(), - total_c_Mg = 
sum(total_c_Mg), # total Mg C per county - total_ha = sum(area_ha), - .groups = "drop_last" - ) |> - dplyr::ungroup() |> - # counties with no fields will result in NA below - dplyr::filter(total_ha > 0) |> - dplyr::mutate( - total_c_Tg = PEcAn.utils::ud_convert(total_c_Mg, "Mg", "Tg"), - c_density_Mg_ha = total_c_Mg / total_ha - ) |> - dplyr::arrange(model_output, county, ensemble) - -ens_members_by_county <- ens_county_preds |> - dplyr::group_by(model_output, county) |> - dplyr::summarize(n_vals = dplyr::n_distinct(total_c_Mg), .groups = "drop") - -if(all(ens_members_by_county$n_vals == length(ensemble_ids))) { - PEcAn.logger::logger.info("All counties have the correct number of ensemble members: (", length(ensemble_ids), ")") -} else { - z <- ens_members_by_county |> - dplyr::group_by(county) |> - dplyr::summarise(n = mean(n_vals)) - PEcAn.logger::logger.error( - sum(z$n != length(ensemble_ids)) / length(z$n), - "counties have the wrong number of ensemble members after downscaling.", - "Check ens_county_preds object." 
- ) -} - -county_summaries <- ens_county_preds |> - dplyr::group_by(model_output, county) |> - dplyr::summarize( - # Number of fields in county should be same for each ensemble member - n = max(n), - mean_total_c_Tg = mean(total_c_Tg), - sd_total_c_Tg = sd(total_c_Tg), - mean_c_density_Mg_ha = mean(c_density_Mg_ha), - sd_c_density_Mg_ha = sd(c_density_Mg_ha), - .groups = "drop" - ) |> - dplyr::mutate( - # Only save 3 significant digits - dplyr::across( - .cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), - .fns = ~ signif(.x, 3) - ) - ) - -readr::write_csv( - county_summaries, - file.path(model_outdir, "county_summaries.csv") -) -PEcAn.logger::logger.info("County summaries written to", file.path(model_outdir, "county_summaries.csv")) - -PEcAn.logger::logger.info( - rep("🌟🌟🌟 ", 5), "\n\n", - "Finished aggregation to County level", "\n\n", - rep("🌟🌟🌟 ", 5) -) diff --git a/scripts/041_aggregate_to_county.R b/scripts/041_aggregate_to_county.R new file mode 100644 index 0000000..cb9875e --- /dev/null +++ b/scripts/041_aggregate_to_county.R @@ -0,0 +1,139 @@ +#### ---- Aggregate to County Level ---- #### +# Inputs: downscaling outputs saved by 040_downscale.R +# Outputs: county_summaries.csv + + +PEcAn.logger::logger.info("***Starting Aggregation to County Level***") + + + +# Load configuration and paths +source("000-config.R") + +downscale_preds_csv <- file.path(model_outdir, "downscaled_preds.csv") +downscale_preds <- vroom::vroom( + downscale_preds_csv, + col_types = readr::cols( + pft = readr::col_character(), + model_output = readr::col_character(), + ensemble = readr::col_double(), + site_id = readr::col_character(), + county = readr::col_character(), + area_ha = readr::col_double(), + c_density_Mg_ha = readr::col_double(), + total_c_Mg = readr::col_double() + ) +) + +ensemble_ids <- unique(downscale_preds$ensemble) + + +# For testing, sample predictions evenly across counties and pfts +if (!PRODUCTION) { + # Sample the same 
site_ids per county across all PFTs + site_sample <- downscale_preds |> + dplyr::distinct(county, site_id) |> + dplyr::group_by(county) |> + dplyr::slice_sample(n = pmin(10L, dplyr::n())) |> + dplyr::ungroup() + + downscale_preds <- downscale_preds |> + dplyr::inner_join(site_sample, by = c("county", "site_id")) +} + +### TODO Debug and catch if NAs appears again +na_summary <- downscale_preds |> + dplyr::summarise(dplyr::across(dplyr::everything(), ~ sum(is.na(.x)))) |> + tidyr::pivot_longer(dplyr::everything(), names_to = "column", values_to = "n_na") |> + dplyr::filter(n_na > 0) + +if (nrow(na_summary) > 0) { + ## concise log message + PEcAn.logger::logger.warn( + "YOU NEED TO DEBUG THIS!!!\n\n", + "NA values detected in `downscale_preds`:\n" + ) + knitr::kable(na_summary, format = "simple") + # remove all rows with NA values + downscale_preds <- tidyr::drop_na(downscale_preds) +} + +ens_county_preds <- downscale_preds |> + # Now aggregate to get county level totals for each pool x ensemble + dplyr::group_by(model_output, pft, county, ensemble) |> + dplyr::summarize( + n = dplyr::n(), + total_c_Mg = sum(total_c_Mg), # total Mg C per county + total_ha = sum(area_ha), + .groups = "drop" + ) |> + # counties with no fields will result in NA below + dplyr::filter(total_ha > 0) |> + dplyr::mutate( + total_c_Tg = PEcAn.utils::ud_convert(total_c_Mg, "Mg", "Tg"), + c_density_Mg_ha = total_c_Mg / total_ha + ) |> + dplyr::arrange(model_output, pft, county, ensemble) + +ens_members_by_county <- ens_county_preds |> + dplyr::group_by(model_output, pft, county) |> + dplyr::summarize(n_vals = dplyr::n_distinct(total_c_Mg), .groups = "drop") + +if (all(ens_members_by_county$n_vals == length(ensemble_ids))) { + PEcAn.logger::logger.info("All counties have the correct number of ensemble members: (", length(ensemble_ids), ")") +} else { + z <- ens_members_by_county |> + dplyr::group_by(county) |> + dplyr::summarise(n = mean(n_vals)) + PEcAn.logger::logger.error( + sum(z$n != 
length(ensemble_ids)) / length(z$n), + "counties have the wrong number of ensemble members after downscaling.", + "Check ens_county_preds object." + ) +} + +county_summaries <- ens_county_preds |> + dplyr::group_by(model_output, pft, county) |> + dplyr::summarize( + # Number of fields in county should be same for each ensemble member + n = max(n), + mean_total_c_Tg = mean(total_c_Tg), + sd_total_c_Tg = sd(total_c_Tg), + mean_c_density_Mg_ha = mean(c_density_Mg_ha), + sd_c_density_Mg_ha = sd(c_density_Mg_ha), + mean_total_ha = mean(total_ha), + sd_total_ha = sd(total_ha), + .groups = "drop" + ) |> + ( + function(df) { + if (any(df$n == 1, na.rm = TRUE)) { + PEcAn.logger::logger.severe( + "At least one (model_output, pft, county) group has n == 1; variability across ensembles cannot be assessed." + ) + } + if (any(df$sd_total_c_Tg == 0, na.rm = TRUE)) { + PEcAn.logger::logger.severe( + "At least one (model_output, pft, county) group has zero variability across ensembles (sd_total_c_Tg == 0)." 
+ ) + } + df + } + ) |> + dplyr::mutate( + # Only save 3 significant digits + dplyr::across( + .cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), + .fns = ~ signif(.x, 3) + ) + ) + +readr::write_csv( + county_summaries, + file.path(model_outdir, "county_summaries.csv") +) +PEcAn.logger::logger.info("County summaries written to", file.path(model_outdir, "county_summaries.csv")) + +PEcAn.logger::logger.info( + "Finished aggregation to county level.\n" +) diff --git a/scripts/041_downscale_analysis.R b/scripts/041_downscale_analysis.R deleted file mode 100644 index f446877..0000000 --- a/scripts/041_downscale_analysis.R +++ /dev/null @@ -1,187 +0,0 @@ -# Load required libraries and data -source("000-config.R") -checkpoint_file <- file.path(cache_dir, "downscaling_output.RData") -checkpoint_objects <- load(checkpoint_file) -PEcAn.logger::logger.info("Loaded checkpoint objects:", paste(checkpoint_objects, collapse = ",")) -# downscale_output_list -# covariates -# design_points -# design_covariates -# ensemble_ids - -##### Variable Importance Analysis ##### -importance_summary <- purrr::map_dfr(outputs_to_extract, ~ { - # Extract the importance for each ensemble model in the carbon pool - importances <- purrr::map(ensemble_ids, function(i) { - model <- downscale_output_list[[.x]][["model"]][[i]] - randomForest::importance(model)[, "%IncMSE"] - }) - - # Turn the list of importance vectors into a data frame - importance_df <- purrr::map_dfr(importances, ~ tibble::tibble(importance = .x), .id = "ensemble") |> - dplyr::group_by(ensemble) |> - dplyr::mutate(predictor = names(importances[[1]])) |> - dplyr::ungroup() - - # Now summarize median and IQR for each predictor across ensembles - summary_df <- importance_df |> - dplyr::group_by(predictor) |> - dplyr::summarize( - median_importance = median(importance, na.rm = TRUE), - lcl_importance = quantile(importance, 0.25, na.rm = TRUE), - ucl_importance = quantile(importance, 0.75, na.rm = TRUE), 
- .groups = "drop" - ) |> - dplyr::mutate(model_output = .x) - return(summary_df) -}) - -for (output in outputs_to_extract) { - # Top 2 predictors for this carbon pool - top_predictors <- importance_summary |> - dplyr::filter(model_output == output) |> - dplyr::arrange(dplyr::desc(median_importance)) |> - dplyr::slice_head(n = 2) |> - dplyr::pull(predictor) - - # Prepare model and subset of covariates for plotting - model <- downscale_output_list[[output]][["model"]][[1]] - - # Set up PNG for three panel plot - PEcAn.logger::logger.info("Creating importance and partial plots for", output) - importance_partial_plot_fig <- here::here( - "figures", - paste0(output, "_importance_partial_plots.png") - ) # Ensure the directory exists - - png( - filename = importance_partial_plot_fig, - width = 14, height = 6, units = "in", res = 300, bg = "white" - ) - par(mfrow = c(1, 3)) - - # Panel 1: Variable importance plot - output_importance <- importance_summary |> - dplyr::filter(model_output == output) - par(mar = c(5, 10, 4, 2)) - with( - output_importance, - dotchart(median_importance, - labels = reorder(predictor, median_importance), - xlab = "Median Increase MSE (SD)", - main = paste("Importance -", output), - pch = 19, col = "steelblue", cex = 1.2 - ) - ) - with( - output_importance, - segments(lcl_importance, - seq_along(predictor), - ucl_importance, - seq_along(predictor), - col = "gray50" - ) - ) - - # Panel 2: Partial plot for top predictor - par(mar = c(5, 5, 4, 2)) - randomForest::partialPlot(model, - pred.data = design_covariates, - x.var = top_predictors[1], - main = paste("Partial Dependence -", top_predictors[1]), - xlab = top_predictors[1], - ylab = paste("Predicted", output), - col = "steelblue", - lwd = 2 - ) - - # Panel 3: Partial plot for second predictor - randomForest::partialPlot(model, - pred.data = design_covariates, - x.var = top_predictors[2], - main = paste("Partial Dependence -", top_predictors[2]), - xlab = top_predictors[2], - ylab = 
paste("Predicted", output), - col = "steelblue", - lwd = 2 - ) - dev.off() # Save combined figure - PEcAn.logger::logger.info( - "Saved importance and partial plots for", - output, " to ", - importance_partial_plot_fig - ) -} - - -## ALE Plots -# robust marginal effect estimates even with correlated predictors. -# library(iml) - -PEcAn.logger::logger.info("***Starting ALE plots***") - -for (output in outputs_to_extract) { - model <- downscale_output_list[[output]][["model"]][[1]] - # Use design points covariates instead of all covariates - - top_predictors <- importance_summary |> - dplyr::filter(model_output == output) |> - dplyr::arrange(dplyr::desc(median_importance)) |> - dplyr::slice_head(n = 2) |> - dplyr::pull(predictor) - - predictor_obj <- iml::Predictor$new( - model = model, - data = design_covariates, - y = NULL, - predict.function = function(m, newdata) predict(m, newdata) - ) - - for (i in seq_along(top_predictors)) { - pred_var_name <- top_predictors[i] - - PEcAn.logger::logger.info("Starting ALE calculation for predictor:", pred_var_name) - ale <- iml::FeatureEffect$new(predictor_obj, feature = pred_var_name, method = "ale") - - PEcAn.logger::logger.info("Saving ALE plot for predictor:", pred_var_name) - ggplot2::ggsave( - filename = here::here("figures", paste0(output, "_ALE_predictor", i, ".png")), - plot = plot(ale) + ggplot2::ggtitle(paste("ALE for", pred_var_name, "on", output)), - width = 6, height = 4, units = "in", dpi = 300 - ) - } -} - -## ICE Plots -PEcAn.logger::logger.info("Creating ICE plots for top predictors") - -for (output in outputs_to_extract) { - model <- downscale_output_list[[output]][["model"]][[1]] - # Use design points covariates instead of all covariates - - - top_predictors <- importance_summary |> - dplyr::filter(model_output == output) |> - dplyr::arrange(dplyr::desc(median_importance)) |> - dplyr::slice_head(n = 2) |> - dplyr::pull(predictor) - - predictor_obj <- iml::Predictor$new( - model = model, - data = 
design_covariates, - y = NULL, # y is not used by predict.randomForest - predict.function = function(m, newdata) stats::predict(m, newdata) - ) - - for (i in seq_along(top_predictors)) { - pred_var_name <- top_predictors[i] - ice <- iml::FeatureEffect$new(predictor_obj, feature = pred_var_name, method = "ice") - - # Save plot - ggplot2::ggsave( - filename = here::here("figures", paste0(output, "_ICE_predictor", i, ".png")), - plot = plot(ice) + ggplot2::ggtitle(paste("ICE for", pred_var_name, "on", output)), - width = 6, height = 4, units = "in", dpi = 300 - ) - } -} diff --git a/scripts/042_county_level_plots.R b/scripts/042_county_level_plots.R deleted file mode 100644 index 6f0a5dc..0000000 --- a/scripts/042_county_level_plots.R +++ /dev/null @@ -1,63 +0,0 @@ -## Create county-level plots -source("000-config.R") -PEcAn.logger::logger.info("Creating county-level plots") -county_boundaries <- sf::st_read(file.path(data_dir, "ca_counties.gpkg")) -county_summaries <- readr::read_csv(file = file.path(model_outdir, "county_summaries.csv")) - -co_preds_to_plot <- county_summaries |> - dplyr::right_join(county_boundaries, by = "county") |> - dplyr::arrange(county, model_output) |> - tidyr::pivot_longer( - cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), - names_to = "stat", - values_to = "value" - ) |> - dplyr::mutate( - units = dplyr::case_when( - stringr::str_detect(stat, "total_c") ~ "Carbon Stock (Tg)", - stringr::str_detect(stat, "c_density") ~ "Carbon Density (Mg/ha)" - ), - stat = dplyr::case_when( - stringr::str_detect(stat, "mean") ~ "Mean", - stringr::str_detect(stat, "sd") ~ "SD" - ) - ) - -units <- rep(unique(co_preds_to_plot$units), each = length(outputs_to_extract)) -pool_x_units <- co_preds_to_plot |> - dplyr::select(model_output, units) |> - dplyr::distinct() |> - # remove na - dplyr::filter(!is.na(model_output)) |> # why is one field in SF county NA? 
- dplyr::arrange(model_output, units) |> - dplyr::filter(!is.na(model_output)) - -p <- purrr::map2(pool_x_units$model_output, pool_x_units$units, function(pool, unit) { - .p <- ggplot2::ggplot( - dplyr::filter(co_preds_to_plot, model_output == pool & units == unit), - ggplot2::aes(geometry = geom, fill = value) - ) + - ggplot2::geom_sf(data = county_boundaries, fill = "lightgrey", color = "black") + - ggplot2::geom_sf() + - ggplot2::scale_fill_viridis_c(option = "plasma") + - ggplot2::theme_minimal() + - ggplot2::facet_grid(model_output ~ stat) + - ggplot2::labs( - title = paste("County-Level Predictions for", pool, unit), - fill = "Value" - ) - - unit <- ifelse(unit == "Carbon Stock (Tg)", "stock", - ifelse(unit == "Carbon Density (Mg/ha)", "density", NA) - ) - - plotfile <- here::here("figures", paste0("county_", pool, "_carbon_", unit, ".png")) - PEcAn.logger::logger.info("Creating county-level plot for", pool, unit) - ggplot2::ggsave( - plot = .p, - filename = plotfile, - width = 10, height = 5, - bg = "white" - ) - return(.p) -}) diff --git a/scripts/042_downscale_analysis.R b/scripts/042_downscale_analysis.R new file mode 100644 index 0000000..d1319b7 --- /dev/null +++ b/scripts/042_downscale_analysis.R @@ -0,0 +1,236 @@ +## Downscale analysis using predictions and covariates +source("000-config.R") + +# Variable Importance (VI) + +preds_csv <- file.path(model_outdir, "downscaled_preds.csv") +meta_json <- file.path(model_outdir, "downscaled_preds_metadata.json") + +downscale_preds <- vroom::vroom( + preds_csv, + col_types = readr::cols( + site_id = readr::col_character(), + pft = readr::col_character(), + ensemble = readr::col_double(), + c_density_Mg_ha = readr::col_double(), + total_c_Mg = readr::col_double(), + area_ha = readr::col_double(), + county = readr::col_character(), + model_output = readr::col_character() + ) +) + +meta <- jsonlite::read_json(meta_json, simplifyVector = TRUE) +ensemble_ids <- if (!is.null(meta$ensembles)) meta$ensembles else 
sort(unique(downscale_preds$ensemble)) + +covariates_csv <- file.path(data_dir, "site_covariates.csv") + +covariates <- readr::read_csv(covariates_csv) |> + dplyr::select(site_id, where(is.numeric), -climregion_id) +covariate_names <- names(dplyr::select(covariates, where(is.numeric))) +PEcAn.logger::logger.info("Loaded predictions, metadata, and covariates") +PEcAn.logger::logger.info("Rows in predictions:", nrow(downscale_preds), "; unique sites:", dplyr::n_distinct(downscale_preds$site_id)) +PEcAn.logger::logger.info("Ensembles detected:", paste(head(ensemble_ids, 10), collapse = ", "), if (length(ensemble_ids) > 10) "..." else "") +PEcAn.logger::logger.info("Number of numeric predictors:", length(covariate_names), "; sample:", paste(utils::head(covariate_names, 6), collapse = ", ")) + +preds_join <- downscale_preds |> + dplyr::left_join(covariates, by = "site_id") |> + tidyr::drop_na(c_density_Mg_ha) + +# Optional: load training site metadata +train_sites_csv <- file.path(model_outdir, "training_sites.csv") +train_sites <- NULL +train_sites <- readr::read_csv(train_sites_csv, show_col_types = FALSE) +PEcAn.logger::logger.info("Training site metadata found:", train_sites_csv, "; rows:", nrow(train_sites)) +PEcAn.logger::logger.info("Training site columns:", paste(colnames(train_sites), collapse = ", ")) + +# Build spec table from predictions (includes mixed pft) +spec_table <- preds_join |> + dplyr::distinct(pft, model_output) |> + dplyr::arrange(pft, model_output) +PEcAn.logger::logger.info("Data prep complete") + +## No surrogate VI: read per-ensemble VI saved by 040 and summarize here +vi_path_for_spec <- function(pft_i, pool) { + spec_key <- paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool)) + file.path(model_outdir, paste0("vi_", spec_key, "_by_ensemble.csv")) +} + +importance_summary <- purrr::pmap_dfr( + list(spec_table$pft, spec_table$model_output), + function(pft_i, pool) { + vi_file <- vi_path_for_spec(pft_i, pool) + if 
(!file.exists(vi_file)) { + PEcAn.logger::logger.warn("VI per-ensemble CSV not found for ", pft_i, "::", pool) + return(NULL) + } + vi_tbl <- readr::read_csv(vi_file, show_col_types = FALSE) + vi_tbl |> + dplyr::group_by(pft, model_output, predictor) |> + dplyr::summarize( + median_importance = stats::median(importance, na.rm = TRUE), + lcl_importance = stats::quantile(importance, 0.25, na.rm = TRUE), + ucl_importance = stats::quantile(importance, 0.75, na.rm = TRUE), + n_ensembles = dplyr::n(), + .groups = "drop" + ) + } +) + +## ALE Plots +# Robust marginal effect estimates even with correlated predictors. +# Run ALE/ICE first to prioritize unbiased effects under correlation. +PEcAn.logger::logger.info("Starting ALE and ICE plots for all specs (priority)") +for (row in seq_len(nrow(spec_table))) { + pft_i <- spec_table$pft[row] + pool <- spec_table$model_output[row] + PEcAn.logger::logger.info("Starting ALE/ICE plots for spec:", paste(pft_i, pool, sep = "::")) + spec_key <- paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool)) + mdl_path <- file.path(cache_dir, "models", paste0(spec_key, "_models.rds")) + trn_path <- file.path(cache_dir, "training_data", paste0(spec_key, "_training.csv")) + if (!file.exists(mdl_path) || !file.exists(trn_path)) { + PEcAn.logger::logger.warn("Saved model/training data missing for ", pft_i, "::", pool, "; skipping ALE/ICE") + next + } + models <- readRDS(mdl_path) + df_spec <- readr::read_csv(trn_path, show_col_types = FALSE) + rf <- models[[1]] + # Use design-point covariates for ALE/ICE as in the original script + design_covariates <- dplyr::select(df_spec, dplyr::all_of(covariate_names)) |> as.data.frame() + requireNamespace("randomForest", quietly = TRUE) + predictor_obj <- iml::Predictor$new( + model = rf, data = design_covariates, y = NULL, + predict.function = function(m, newdata) stats::predict(m, newdata) + ) + top_predictors <- importance_summary |> + dplyr::filter(model_output == pool, pft == 
pft_i) |> + dplyr::arrange(dplyr::desc(median_importance)) |> + dplyr::slice_head(n = 2) |> + dplyr::pull(predictor) + for (j in seq_along(top_predictors)) { + pred_var_name <- top_predictors[j] + ale <- iml::FeatureEffect$new(predictor_obj, feature = pred_var_name, method = "ale") + ggsave_optimized( + filename = here::here("figures", paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool), "_ALE_predictor", j, ".svg")), + plot = plot(ale) + ggplot2::ggtitle(paste("ALE for", pred_var_name, "on", pool, "-", pft_i)), + width = 6, height = 4, units = "in" + ) + ice <- iml::FeatureEffect$new(predictor_obj, feature = pred_var_name, method = "ice") + ggsave_optimized( + filename = here::here("figures", paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool), "_ICE_predictor", j, ".svg")), + plot = plot(ice) + ggplot2::ggtitle(paste("ICE for", pred_var_name, "on", pool, "-", pft_i)), + width = 6, height = 4, units = "in" + ) + } +} + +for (row in seq_len(nrow(spec_table))) { + pft_i <- spec_table$pft[row] + pool <- spec_table$model_output[row] + PEcAn.logger::logger.info("Processing plots for spec:", paste(pft_i, pool, sep = "::")) + + spec_key <- paste0(janitor::make_clean_names(pft_i), "_", janitor::make_clean_names(pool)) + mdl_path <- file.path(cache_dir, "models", paste0(spec_key, "_models.rds")) + trn_path <- file.path(cache_dir, "training_data", paste0(spec_key, "_training.csv")) + rf <- NULL + + # Here, the cached training CSV corresponds to the design-point covariates used to train the RF. 
+ design_covariates <- NULL + if (file.exists(mdl_path) && file.exists(trn_path)) { + models <- readRDS(mdl_path) + df_spec <- readr::read_csv(trn_path, show_col_types = FALSE) + rf <- models[[1]] + if (inherits(rf, "randomForest")) { + design_covariates <- dplyr::select(df_spec, dplyr::all_of(covariate_names)) |> + as.data.frame() + } + } + if (is.null(rf)) { + PEcAn.logger::logger.warn("Saved model not found for ", pft_i, "::", pool, "; skipping PDP") + next + } + + top_predictors <- importance_summary |> + dplyr::filter(model_output == pool, pft == pft_i) |> + dplyr::arrange(dplyr::desc(median_importance)) |> + dplyr::slice_head(n = 2) |> + dplyr::pull(predictor) + + if (length(top_predictors) < 2) { + PEcAn.logger::logger.warn("Not enough predictors for partial plots:", paste(pft_i, pool, sep = "::")) + next + } + + # Only produce PDPs if features are sufficiently uncorrelated + corr_threshold <- suppressWarnings(as.numeric(Sys.getenv("PDP_CORR_THRESHOLD", unset = "0.3"))) + if (is.na(corr_threshold)) corr_threshold <- 0.3 + allow_pdp <- FALSE + if (!is.null(design_covariates) && ncol(design_covariates) > 1) { + cm <- try(stats::cor(design_covariates, use = "pairwise.complete.obs"), silent = TRUE) + if (!inherits(cm, "try-error")) { + # For each top predictor, require low correlation with all other predictors + max_corr <- vapply(top_predictors, function(v) { + others <- setdiff(colnames(cm), v) + if (!v %in% colnames(cm) || length(others) == 0) return(1) + max(abs(cm[v, others]), na.rm = TRUE) + }, numeric(1)) + allow_pdp <- all(max_corr <= corr_threshold) + PEcAn.logger::logger.info( + sprintf("PDP correlation check (max abs corr per var): %s; threshold=%.2f", + paste(sprintf("%s=%.2f", top_predictors, max_corr), collapse = ", "), corr_threshold) + ) + } + } + if (!allow_pdp) { + PEcAn.logger::logger.info("Skipping PDPs due to correlated predictors; rely on ALE/ICE.") + next + } + + PEcAn.logger::logger.info("Creating importance and partial plots (PDP) for", 
paste(pft_i, pool, sep = "::")) + clean_pft <- janitor::make_clean_names(pft_i) + clean_pool <- janitor::make_clean_names(pool) + importance_partial_plot_fig <- here::here( + "figures", + paste0(clean_pft, "_", clean_pool, "_importance_partial_plots.png") + ) + + png(filename = importance_partial_plot_fig, width = 14, height = 6, units = "in", res = 300, bg = "white") + par(mfrow = c(1, 3)) + + # Panel 1: Variable importance plot + output_importance <- importance_summary |> + dplyr::filter(model_output == pool, pft == pft_i) + par(mar = c(5, 10, 4, 2)) + with( + output_importance, + dotchart(median_importance, + labels = reorder(predictor, median_importance), + xlab = "Median Increase MSE (SD)", + main = paste("Importance -", pool, "-", pft_i), + pch = 19, col = "steelblue", cex = 1.2 + ) + ) + with( + output_importance, + segments(lcl_importance, seq_along(predictor), ucl_importance, seq_along(predictor), col = "gray50") + ) + + # Panel 2 and 3: Partial plots for top predictors + par(mar = c(5, 5, 4, 2)) + # Set common y-limits from training response + y_range <- range(rf$y, na.rm = TRUE) + yl <- c(floor(min(y_range)), ceiling(max(y_range))) + + randomForest::partialPlot(rf, + pred.data = design_covariates, x.var = top_predictors[1], ylim = yl, + main = paste("Partial Dependence -", top_predictors[1]), + xlab = top_predictors[1], ylab = paste("Predicted", pool, "-", pft_i), col = "steelblue", lwd = 2 + ) + randomForest::partialPlot(rf, + pred.data = design_covariates, x.var = top_predictors[2], ylim = yl, + main = paste("Partial Dependence -", top_predictors[2]), + xlab = top_predictors[2], ylab = paste("Predicted", pool, "-", pft_i), col = "steelblue", lwd = 2 + ) + dev.off() + PEcAn.logger::logger.info("Saved importance/PDP figure:", importance_partial_plot_fig) +} diff --git a/scripts/043_county_level_plots.R b/scripts/043_county_level_plots.R new file mode 100644 index 0000000..e6ec75e --- /dev/null +++ b/scripts/043_county_level_plots.R @@ -0,0 +1,314 @@ +## 
Create county-level plots +## +## Usage: +## Rscript scripts/043_county_level_plots.R +## +## Inputs (from 000-config.R paths): +## - model_outdir/out/county_summaries.csv (from 041_aggregate_to_county.R) +## - model_outdir/out/downscaled_preds.csv (for mixed-scenario diffs and deltas) +## - model_outdir/out/downscaled_deltas.csv (optional; for delta maps) +## - data/ca_counties.gpkg (county boundaries) +## +## Outputs: +## - figures/county___carbon_{density,stock}.webp +## - figures/county_diff_woody_plus_annual_minus_woody__carbon_{density,stock}.webp +## - figures/county_delta___carbon_{density,stock}.webp +## - Additional poster-grade exports for selected maps (PDF/SVG/PNG) +## +## Notes: +## - Requires R/ggsave_optimized.R to be sourced via 000-config.R +## - Uses facet columns for Mean and SD in each output +source("000-config.R") +PEcAn.logger::logger.info("Creating county-level plots") +county_boundaries <- sf::st_read(file.path(data_dir, "ca_counties.gpkg")) +county_summaries <- readr::read_csv(file = file.path(model_outdir, "county_summaries.csv")) + +co_preds_to_plot <- county_summaries |> + dplyr::right_join(county_boundaries, by = "county") |> + dplyr::arrange(county, model_output, pft) |> + tidyr::pivot_longer( + cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), + names_to = "stat", + values_to = "value" + ) |> + dplyr::mutate( + units = dplyr::case_when( + stringr::str_detect(stat, "total_c") ~ "Carbon Stock (Tg)", + stringr::str_detect(stat, "c_density") ~ "Carbon Density (Mg/ha)" + ), + stat = dplyr::case_when( + stringr::str_detect(stat, "mean") ~ "Mean", + stringr::str_detect(stat, "sd") ~ "SD" + ) + ) + +combos <- co_preds_to_plot |> + dplyr::filter(!is.na(model_output), !is.na(pft)) |> + dplyr::distinct(pft, model_output, units) |> + dplyr::arrange(pft, model_output, units) + +# Optional filters via env vars to generate a subset only +pft_filter <- Sys.getenv("PFT_FILTER", unset = "") +pool_filter <- 
Sys.getenv("POOL_FILTER", unset = "") +units_filter <- Sys.getenv("UNITS_FILTER", unset = "") +if (nzchar(pft_filter)) combos <- dplyr::filter(combos, pft == !!pft_filter) +if (nzchar(pool_filter)) combos <- dplyr::filter(combos, model_output == !!pool_filter) +if (nzchar(units_filter)) combos <- dplyr::filter(combos, units == !!units_filter) + +# Color scale controls +# Priority: COLOR_SCALE if set -> POSTER_PALETTE if TRUE -> default (plasma) +use_poster_palette <- isTRUE(as.logical(Sys.getenv("POSTER_PALETTE", "FALSE"))) +color_scale_opt <- tolower(Sys.getenv("COLOR_SCALE", unset = "")) +custom_colors <- Sys.getenv("CUSTOM_COLORS", unset = "") # comma-separated hex +custom_values <- Sys.getenv("CUSTOM_VALUES", unset = "") # comma-separated numerics in [0,1] + +fill_scale <- NULL +if (nzchar(color_scale_opt)) { + if (color_scale_opt %in% c("viridis","plasma","magma","inferno","cividis")) { + fill_scale <- ggplot2::scale_fill_viridis_c(option = color_scale_opt, na.value = "white") + } else if (color_scale_opt == "custom" && nzchar(custom_colors)) { + cols <- trimws(strsplit(custom_colors, ",", fixed = TRUE)[[1]]) + vals <- if (nzchar(custom_values)) as.numeric(trimws(strsplit(custom_values, ",", fixed = TRUE)[[1]])) else NULL + if (is.null(vals)) { + fill_scale <- ggplot2::scale_fill_gradientn(colors = cols, na.value = "white") + } else { + fill_scale <- ggplot2::scale_fill_gradientn(colors = cols, values = vals, na.value = "white") + } + } +} +if (is.null(fill_scale)) { + if (use_poster_palette) { + # Poster palette requested earlier: light green -> poster green -> poster purple + fill_scale <- ggplot2::scale_fill_gradientn( + colors = c("#d7f2d3", "#62b874", "#4a2f86"), + values = c(0, 0.6, 1), + na.value = "white" + ) + } else { + fill_scale <- ggplot2::scale_fill_viridis_c(option = "plasma", na.value = "white") + } +} + +p <- purrr::pmap( + list(combos$pft, combos$model_output, combos$units), + function(.pft, pool, unit) { + dat <- 
dplyr::filter(co_preds_to_plot, pft == .pft, model_output == pool, units == unit) |> + dplyr::mutate(value_plot = value) + .plt <- ggplot2::ggplot( + dat, + ggplot2::aes(geometry = geom, fill = value_plot) + ) + + ggplot2::geom_sf(data = county_boundaries, mapping = ggplot2::aes(geometry = geom), fill = "white", color = "black", inherit.aes = FALSE) + + ggplot2::geom_sf() + + fill_scale + + ggplot2::theme_minimal() + + ggplot2::facet_grid(model_output ~ stat) + + ggplot2::labs( + title = paste("County-Level", pool, "-", .pft), + fill = unit + ) + + ggplot2::guides(fill = ggplot2::guide_colorbar(title.position = "top")) + + unit_key <- ifelse(unit == "Carbon Stock (Tg)", "stock", + ifelse(unit == "Carbon Density (Mg/ha)", "density", NA) + ) + pft_key <- stringr::str_replace_all(.pft, "[^A-Za-z0-9]+", "_") + plotfile <- here::here("figures", paste0("county_", pft_key, "_", pool, "_carbon_", unit_key, ".webp")) + PEcAn.logger::logger.info("Creating county-level plot for", pool, unit, "PFT:", .pft) + ggsave_optimized( + filename = plotfile, + plot = .plt, + width = 10, height = 5, units = "in", dpi = 96, + bg = "white" + ) + # Also export poster-grade formats for the AGB density map of woody perennials + if (.pft == "woody perennial crop" && pool == "AGB" && unit_key == "density") { + basefile <- here::here("figures", paste0("county_", pft_key, "_", pool, "_carbon_", unit_key)) + PEcAn.logger::logger.info("Exporting poster-grade versions (28.36x14.18 in):", paste0(basename(basefile), "{.pdf,.svg,.png}")) + # Vector formats (preferred for print) at 14.18 in tall (2:1 aspect) + ggsave_optimized(filename = paste0(basefile, ".pdf"), plot = .plt, width = 28.36, height = 14.18, units = "in", bg = "white") + ggsave_optimized(filename = paste0(basefile, ".svg"), plot = .plt, width = 28.36, height = 14.18, units = "in", bg = "white") + # High-resolution raster as fallback + ggsave_optimized(filename = paste0(basefile, ".png"), plot = .plt, width = 28.36, height = 14.18, units = 
"in", dpi = 300, bg = "white") + } + # High-res PNG for woody perennials SOC stock (12" tall for print) + if (.pft == "woody perennial crop" && pool == "TotSoilCarb" && unit_key == "stock") { + basefile_soc_stock <- here::here("figures", paste0("county_", pft_key, "_", pool, "_carbon_", unit_key)) + # Maintain 2:1 aspect (width:height) used above: width=24in, height=12in + PEcAn.logger::logger.info("Exporting 12in-tall high-res PNG:", paste0(basename(basefile_soc_stock), ".png")) + ggsave_optimized(filename = paste0(basefile_soc_stock, ".png"), plot = .plt, + width = 28.36, height = 14.18, units = "in", dpi = 300, bg = "white") + } + # High-res exports for annual crop SOC stock + if (.pft == "annual crop" && pool == "TotSoilCarb" && unit_key == "stock") { + basefile_ann_soc_stock <- here::here("figures", paste0("county_", pft_key, "_", pool, "_carbon_", unit_key)) + PEcAn.logger::logger.info("Exporting high-res annual SOC stock (28.36x14.18 in):", paste0(basename(basefile_ann_soc_stock), "{.pdf,.png}")) + ggsave_optimized(filename = paste0(basefile_ann_soc_stock, ".pdf"), plot = .plt, + width = 28.36, height = 14.18, units = "in", bg = "white") + ggsave_optimized(filename = paste0(basefile_ann_soc_stock, ".png"), plot = .plt, + width = 28.36, height = 14.18, units = "in", dpi = 300, bg = "white") + } + # Export high-res PDF/PNG for County-level woody+herbaceous (mixed) SOC maps + if (pool == "TotSoilCarb" && (grepl("woody", .pft, ignore.case = TRUE)) && + (grepl("annual", .pft, ignore.case = TRUE) || grepl("herb", .pft, ignore.case = TRUE))) { + basefile_soc <- here::here("figures", paste0("county_", pft_key, "_", pool, "_carbon_", unit_key)) + PEcAn.logger::logger.info("Exporting high-res SOC versions (28.36x14.18 in):", paste0(basename(basefile_soc), "{.pdf,.png}")) + ggsave_optimized(filename = paste0(basefile_soc, ".pdf"), plot = .plt, width = 28.36, height = 14.18, units = "in", bg = "white") + ggsave_optimized(filename = paste0(basefile_soc, ".png"), plot = 
.plt, width = 28.36, height = 14.18, units = "in", dpi = 300, bg = "white") + } + return(.plt) + } +) + +## --- County-level differences from matched fields: (woody + annual) minus (woody perennial crop) --- ## +dp <- vroom::vroom( + file.path(model_outdir, "downscaled_preds.csv"), + col_types = readr::cols( + site_id = readr::col_character(), + pft = readr::col_character(), + ensemble = readr::col_double(), + c_density_Mg_ha = readr::col_double(), + total_c_Mg = readr::col_double(), + area_ha = readr::col_double(), + county = readr::col_character(), + model_output = readr::col_character() + ) +) +mix <- dp |> + dplyr::filter(pft == "woody + annual") |> + dplyr::select(site_id, ensemble, model_output, county, area_ha_mix = area_ha, total_c_Mg_mix = total_c_Mg) +wood <- dp |> + dplyr::filter(pft == "woody perennial crop") |> + dplyr::select(site_id, ensemble, model_output, county, area_ha_woody = area_ha, total_c_Mg_woody = total_c_Mg) + +# Log if duplicate keys remain (should be rare after upstream normalization) +mix_dups <- mix |> + dplyr::count(site_id, ensemble, model_output, county) |> + dplyr::filter(n > 1) +wood_dups <- wood |> + dplyr::count(site_id, ensemble, model_output, county) |> + dplyr::filter(n > 1) +if (nrow(mix_dups) > 0 || nrow(wood_dups) > 0) { + PEcAn.logger::logger.severe(paste0( + "Duplicate keys detected prior to join (mix=", nrow(mix_dups), ", wood=", nrow(wood_dups), "). ", + "Fix upstream duplication; refusing to silently aggregate." 
+ )) +} + +diff_county <- mix |> + dplyr::inner_join(wood, by = c("site_id", "ensemble", "model_output", "county")) |> + dplyr::mutate(diff_total_Mg = total_c_Mg_mix - total_c_Mg_woody, area_ha = dplyr::coalesce(area_ha_woody, area_ha_mix)) |> + dplyr::group_by(county, model_output, ensemble) |> + dplyr::summarise(diff_total_Mg = sum(diff_total_Mg), total_ha = sum(area_ha), .groups = "drop") |> + dplyr::mutate(diff_total_Tg = PEcAn.utils::ud_convert(diff_total_Mg, "Mg", "Tg"), diff_density_Mg_ha = diff_total_Mg / total_ha) |> + dplyr::group_by(county, model_output) |> + dplyr::summarise(mean_diff_total_Tg = mean(diff_total_Tg), mean_diff_density_Mg_ha = mean(diff_density_Mg_ha), .groups = "drop") |> + dplyr::right_join(county_boundaries, by = "county") + +for (pool in unique(stats::na.omit(diff_county$model_output))) { + dat_pool <- dplyr::filter(diff_county, model_output == pool) + + # Density difference map (Mg/ha) + p_density <- ggplot2::ggplot(dat_pool, ggplot2::aes(geometry = geom, fill = mean_diff_density_Mg_ha)) + + ggplot2::geom_sf(data = county_boundaries, fill = "white", color = "black") + + ggplot2::geom_sf() + + ggplot2::scale_fill_gradient2(low = "royalblue", mid = "white", high = "firebrick", midpoint = 0, na.value = "white") + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Difference in Carbon Density (Mg/ha): (woody + annual) - (woody)", pool), + fill = "Delta (Mg/ha)" + ) + ggsave_optimized( + filename = here::here("figures", paste0("county_diff_woody_plus_annual_minus_woody_", pool, "_carbon_density.webp")), + plot = p_density, + width = 10, height = 5, units = "in", dpi = 96, bg = "white" + ) + + # Stock difference map (Tg) + p_stock <- ggplot2::ggplot(dat_pool, ggplot2::aes(geometry = geom, fill = mean_diff_total_Tg)) + + ggplot2::geom_sf(data = county_boundaries, fill = "white", color = "black") + + ggplot2::geom_sf() + + ggplot2::scale_fill_gradient2(low = "royalblue", mid = "white", high = "firebrick", midpoint = 0, na.value = 
"white") + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Difference in Carbon Stock (Tg): (woody + annual) - (woody)", pool), + fill = "Delta (Tg)" + ) + ggsave_optimized( + filename = here::here("figures", paste0("county_diff_woody_plus_annual_minus_woody_", pool, "_carbon_stock.webp")), + plot = p_stock, + width = 10, height = 5, units = "in", dpi = 96, bg = "white" + ) +} + +## --- County-level deltas: start->end for woody and annual --- ## +delta_csv <- file.path(model_outdir, "downscaled_deltas.csv") +if (file.exists(delta_csv)) { + deltas <- vroom::vroom( + delta_csv, + col_types = readr::cols( + site_id = readr::col_character(), + pft = readr::col_character(), + ensemble = readr::col_double(), + delta_c_density_Mg_ha = readr::col_double(), + delta_total_c_Mg = readr::col_double(), + area_ha = readr::col_double(), + county = readr::col_character(), + model_output = readr::col_character() + ) + ) + + delta_county <- deltas |> + dplyr::group_by(model_output, pft, county, ensemble) |> + dplyr::summarize(total_delta_Mg = sum(delta_total_c_Mg), total_ha = sum(area_ha), .groups = "drop") |> + dplyr::mutate( + delta_density_Mg_ha = total_delta_Mg / total_ha, + delta_total_Tg = PEcAn.utils::ud_convert(total_delta_Mg, "Mg", "Tg") + ) |> + dplyr::group_by(model_output, pft, county) |> + dplyr::summarize( + mean_delta_density_Mg_ha = mean(delta_density_Mg_ha), + mean_delta_total_Tg = mean(delta_total_Tg), + .groups = "drop" + ) |> + dplyr::right_join(county_boundaries, by = "county") + + combos_delta <- delta_county |> + dplyr::filter(!is.na(model_output), !is.na(pft)) |> + dplyr::distinct(pft, model_output) + + purrr::pwalk( + combos_delta, + function(pft, model_output) { + datp <- dplyr::filter(delta_county, pft == !!pft, model_output == !!model_output) + pft_key <- stringr::str_replace_all(pft, "[^A-Za-z0-9]+", "_") + # density + p_den <- ggplot2::ggplot(datp, ggplot2::aes(geometry = geom, fill = mean_delta_density_Mg_ha)) + + ggplot2::geom_sf(data = 
county_boundaries, fill = "white", color = "black") + + ggplot2::geom_sf() + + ggplot2::scale_fill_gradient2(low = "royalblue", mid = "white", high = "firebrick", midpoint = 0, na.value = "white") + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Delta Density (start->end)", model_output, "-", pft), + fill = "Delta (Mg/ha)" + ) + ggsave_optimized( + filename = here::here("figures", paste0("county_delta_", pft_key, "_", model_output, "_carbon_density.webp")), + plot = p_den, width = 10, height = 5, units = "in", dpi = 96, bg = "white" + ) + # stock + p_stk <- ggplot2::ggplot(datp, ggplot2::aes(geometry = geom, fill = mean_delta_total_Tg)) + + ggplot2::geom_sf(data = county_boundaries, fill = "white", color = "black") + + ggplot2::geom_sf() + + ggplot2::scale_fill_gradient2(low = "royalblue", mid = "white", high = "firebrick", midpoint = 0, na.value = "white") + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Delta Stock (start->end)", model_output, "-", pft), + fill = "Delta (Tg)" + ) + ggsave_optimized(filename = here::here("figures", paste0("county_delta_", pft_key, "_", model_output, "_carbon_stock.webp")), plot = p_stk, width = 10, height = 5, units = "in", dpi = 96, bg = "white") + } + ) +} else { + PEcAn.logger::logger.warn("downscaled_deltas.csv not found; skipping delta maps") +} diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R new file mode 100644 index 0000000..92ab346 --- /dev/null +++ b/tests/testthat/helper.R @@ -0,0 +1,18 @@ +# testthat helper: configure logging and source all R/ functions before tests + +# Silence PEcAn.logger during tests to reduce noise +level <- PEcAn.logger::logger.setLevel("OFF") +withr::defer(PEcAn.logger::logger.setLevel(level)) + +# Source project functions +r_dir <- here::here("R") +r_scripts <- list.files(r_dir, pattern = "\\.R$", full.names = TRUE) +for (f in r_scripts) { + result <- tryCatch( + source(f, local = TRUE), + error = function(e) { + warning("Failed to source ", 
basename(f), ": ", conditionMessage(e)) + NULL + } + ) +} diff --git a/tests/testthat/test-combine_mixed_crops.R b/tests/testthat/test-combine_mixed_crops.R new file mode 100644 index 0000000..251f070 --- /dev/null +++ b/tests/testthat/test-combine_mixed_crops.R @@ -0,0 +1,128 @@ +library(testthat) + +test_that("weighted mixing basic case", { + res <- combine_mixed_crops( + woody_value = 100, annual_value = 50, + annual_cover = 0.2, woody_cover = 0.8, + method = "weighted" + ) + expected <- 0.8 * 100 + 0.2 * 50 + expect_true(abs(res - expected) < 1e-10) +}) + +test_that("weighted cover sum mismatch yields NA output", { + res <- combine_mixed_crops( + woody_value = 100, annual_value = 50, + annual_cover = 0.25, woody_cover = 0.8, + method = "weighted" + ) + expect_true(is.na(res)) +}) + +test_that("incremental mixing basic case", { + res <- combine_mixed_crops( + woody_value = 200, annual_value = 220, annual_init = 200, + annual_cover = 0.3, woody_cover = 1.0, method = "incremental" + ) + expected <- 200 + 0.3 * (220 - 200) + expect_true(abs(res - expected) < 1e-10) +}) + +test_that("incremental requires annual_init", { + testthat::expect_error( + res <- combine_mixed_crops( + woody_value = 200, annual_value = 220, + annual_cover = 0.3, woody_cover = 1.0, + method = "incremental" + ), + "annual_init is required but missing" + ) +}) + +test_that("incremental requires woody_cover == 1", { + testthat::expect_error( + res <- combine_mixed_crops( + woody_value = 200, annual_value = 220, annual_init = 200, + annual_cover = 0.3, woody_cover = 0.9, + method = "incremental" + ), + "woody_cover must be 1" + ) +}) + +test_that("NA inputs rejected", { + expect_error( + res <- combine_mixed_crops( + woody_value = NA_real_, annual_value = 50, + annual_cover = 0.2, woody_cover = 0.8, + method = "weighted" + ), + "NA values not allowed in inputs" + ) +}) + +test_that("length mismatch rejected", { + expect_error( + res <- combine_mixed_crops( + woody_value = 1:2, annual_value = 1:3, 
+ annual_cover = 0.2, woody_cover = 0.8, + method = "weighted" + ), + "Can't recycle" + ) +}) + +test_that("weighted mixing works with vector inputs", { + res <- combine_mixed_crops( + woody_value = c(100, 200, 150), + annual_value = c(50, 100, 75), + annual_cover = 0.2, + woody_cover = 0.8, + method = "weighted" + ) + expected <- 0.8 * c(100, 200, 150) + 0.2 * c(50, 100, 75) + expect_equal(res, expected, tolerance = 1e-10) + expect_length(res, 3) +}) + +test_that("incremental mixing handles delta = 0 (annual_init equals annual_value)", { + res <- combine_mixed_crops( + woody_value = 200, + annual_value = 150, # same as annual_init + annual_init = 150, + annual_cover = 0.5, + woody_cover = 1.0, + method = "incremental" + ) + # delta = 150 - 150 = 0, so result should equal woody_value + expect_equal(res, 200) +}) + +test_that("incremental accepts woody_cover within tolerance of 1", { + # woody_cover = 0.999 should be accepted (within 0.1% of 1) + res <- combine_mixed_crops( + woody_value = 200, + annual_value = 220, + annual_init = 200, + annual_cover = 0.3, + woody_cover = 0.999, # within 1e-3 tolerance + method = "incremental" + ) + expected <- 200 + 0.3 * (220 - 200) + expect_equal(res, expected, tolerance = 1e-10) +}) + +test_that("incremental rejects woody_cover outside tolerance of 1", { + # woody_cover = 0.99 is outside the 0.1% tolerance + expect_error( + combine_mixed_crops( + woody_value = 200, + annual_value = 220, + annual_init = 200, + annual_cover = 0.3, + woody_cover = 0.99, # outside tolerance (1% away) + method = "incremental" + ), + "woody_cover must be 1" + ) +}) \ No newline at end of file diff --git a/tests/testthat/test-match_site_ids.R b/tests/testthat/test-match_site_ids.R new file mode 100644 index 0000000..8a7e5e9 --- /dev/null +++ b/tests/testthat/test-match_site_ids.R @@ -0,0 +1,161 @@ +context("match_site_ids_by_location") +test_that("match_site_ids_by_location returns full mapping when all IDs exist", { + target <- data.frame( + site_id 
= c("A", "B"), + lat = c(34.0, 35.0), + lon = c(-118.0, -119.0) + ) + reference <- target + + map <- match_site_ids_by_location(target, reference) + expect_s3_class(map, "data.frame") + expect_equal(nrow(map), nrow(target)) + expect_equal(map$target_site_id, target$site_id) + expect_equal(map$matched_site_id, target$site_id) + expect_true(all(map$distance_m == 0)) + + updated <- update_site_ids_by_location(target, reference) + expect_equal(updated$site_id, target$site_id) +}) + +test_that("update_site_ids_by_location replaces missing IDs by nearest", { + # B is missing from reference; it's closest to Y + target <- data.frame( + site_id = c("A", "B"), + lat = c(34.0000, 35.3000), + lon = c(-118.0000, -119.4000) + ) + reference <- data.frame( + site_id = c("A", "X", "Y"), + lat = c(34.0000, 35.3000, 35.3000), + lon = c(-118.0000, -119.5000, -119.4010) # Y is ~90m from B + ) + + map <- match_site_ids_by_location(target, reference) + expect_s3_class(map, "data.frame") + expect_equal(nrow(map), 2) + # A remains A with zero distance + rowA <- map[map$target_site_id == "A", ] + expect_equal(rowA$matched_site_id, "A") + expect_true(rowA$distance_m == 0) + # B maps to Y with positive distance + rowB <- map[map$target_site_id == "B", ] + expect_equal(rowB$matched_site_id, "Y") + expect_true(is.numeric(rowB$distance_m)) + expect_gt(rowB$distance_m, 0) + + updated <- update_site_ids_by_location(target, reference) + expect_equal(updated$site_id, c("A", "Y")) +}) + +test_that("match_site_ids_by_location map_all=TRUE maps all rows and includes close class", { + target <- data.frame( + site_id = c("A", "B"), + lat = c(34.0, 35.0), + lon = c(-118.0, -119.0) + ) + reference <- data.frame( + site_id = c("A", "B"), + lat = c(34.0, 35.0), + lon = c(-118.0, -119.0) + ) + + map_all <- match_site_ids_by_location(target, reference, map_all = TRUE) + expect_s3_class(map_all, "data.frame") + expect_equal(nrow(map_all), 2) + expect_true(all(c("target_site_id", "matched_site_id", "target_lat", 
"target_lon", "ref_lat", "ref_lon", "distance_m", "close") %in% names(map_all))) + expect_equal(sort(map_all$target_site_id), c("A", "B")) + expect_equal(map_all$target_site_id, map_all$matched_site_id) + # identical coords should produce zero distance, even when map_all=TRUE + expect_true(all(map_all$distance_m == 0)) +}) + +test_that("update_site_ids_by_location supports custom column names", { + target <- data.frame( + id = c("A", "B"), + latitude = c(34.0000, 35.3000), + longitude = c(-118.0000, -119.4000) + ) + reference <- data.frame( + id = c("A", "X", "Y"), + latitude = c(34.0000, 35.3000, 35.3000), + longitude = c(-118.0000, -119.5000, -119.4010) + ) + + updated <- update_site_ids_by_location( + target_df = target, + reference_df = reference, + id_col = "id", + target_lat_col = "latitude", + target_lon_col = "longitude", + reference_id_col = "id", + reference_lat_col = "latitude", + reference_lon_col = "longitude" + ) + expect_equal(updated$id, c("A", "Y")) +}) + +test_that("match_site_ids_by_location errors when exceeding max_distance", { + # Construct a far-away reference so distance >> 100 m + target <- data.frame( + site_id = c("A"), + lat = c(34.0000), + lon = c(-118.0000) + ) + reference <- data.frame( + site_id = c("Z"), + lat = c(35.0000), # ~111 km north + lon = c(-118.0000) + ) + + expect_error( + match_site_ids_by_location(target, reference, map_all = TRUE, max_distance = 100), + regexp = "exceed max_distance|exceed max_distance of" + ) +}) + +test_that("close classification reflects ~90m as 'very close (<=100m)'", { + testthat::skip_if_not_installed("terra") + + target <- data.frame( + site_id = c("A", "B"), + lat = c(34.0000, 35.3000), + lon = c(-118.0000, -119.4000) + ) + reference <- data.frame( + site_id = c("A", "X", "Y"), + lat = c(34.0000, 35.3000, 35.3000), + lon = c(-118.0000, -119.5000, -119.4010) + ) + + map <- match_site_ids_by_location(target, reference) + rowB <- map[map$target_site_id == "B", ] + expect_true(rowB$distance_m > 0) 
+ expect_equal(rowB$close, "very close (<=100m)") +}) + +test_that("map_all computes distances for ID-matched rows", { + testthat::skip_if_not_installed("terra") + + target <- data.frame( + site_id = c("A"), + lat = c(34.0000), + lon = c(-118.0000) + ) + reference <- data.frame( + site_id = c("A"), + lat = c(34.0000), + lon = c(-118.0009) # ~82 m west at this latitude + ) + + map_default <- match_site_ids_by_location(target, reference, map_all = FALSE) + expect_equal(nrow(map_default), 1) + expect_equal(map_default$matched_site_id, "A") + expect_true(map_default$distance_m == 0) + + map_all <- match_site_ids_by_location(target, reference, map_all = TRUE) + expect_equal(nrow(map_all), 1) + expect_equal(map_all$matched_site_id, "A") + expect_true(map_all$distance_m > 0) + expect_equal(map_all$close, "very close (<=100m)") +})