diff --git a/data/anchor_sites_ids.csv b/data/anchor_sites_ids.csv new file mode 100644 index 0000000..028f6fd --- /dev/null +++ b/data/anchor_sites_ids.csv @@ -0,0 +1,26 @@ +id,lat,lon,location,site_name,distance,geometry +2e11530b93cd492f,36.177799888824005,-120.2041070387766,US-ASH,San Joaquin Valley Almond High Salinity,0,"c(-18200.4533456107, -204351.896427268)" +3747151a269036b7,36.94728213363369,-120.10533383829694,US-ASL,San Joaquin Valley Almond Low Salinity,0,"c(-9106.72692414989, -118913.170740419)" +2e11530b93cd492f,36.177799888824005,-120.2041070387766,US-ASM,San Joaquin Valley Almond Medium Salinity,0,"c(-18200.4533456107, -204351.896427268)" +e07417e56c78efbc,36.82778534752612,-120.13964104692187,US-PSH,San Joaquin Valley Pistachio High Salinity,0,"c(-12443.4766403388, -132135.913488504)" +e07417e56c78efbc,36.82778534752612,-120.13964104692187,US-PSL,San Joaquin Valley Pistachio Low Salinity,0,"c(-12443.4766403388, -132135.913488504)" +278cc612aa84983a,38.099349553216186,-121.5015478519803,US-Bi1,Bouldin Island Alfalfa,0,"c(-131305.841205359, 10246.5206530676)" +dce435941adb4679,38.109586695737626,-121.53611176665588,US-Bi2,Bouldin Island Corn,0,"c(-134423.030434314, 11397.1114464211)" +3cea434df9ea5a61,32.81280501889075,-115.44206224718269,US-Dea,UC Desert REC Alfalfa,0,"c(427219.032719703, -567399.727086836)" +aec9f214083b8f69,38.131541484694665,-121.55191049337776,US-DS1,Staten Corn 1,0,"c(-134720.473528729, 14114.7427923167)" +713f08eb3322d4df,38.134598947289916,-121.51170668226486,US-DS2,Staten Corn 2,0,"c(-132525.126666226, 14524.1023560269)" +175358d15d27e2f2,38.12345747466536,-121.54982768148254,US-DS3,Staten Rice 1,0,"c(-135613.932821504, 13017.4923524545)" +bfee4301f7c99060,36.35733308587667,-119.09282519975774,US-Lin,Lindcove Orchard,0,"c(81360.1815020208, -184101.87965719)" +86e23fd30a7c0360,39.577102324150175,-121.86224204465502,US-RGB,Butte County Rice Farm,0,"c(-159486.330544182, 175158.013580008)" +e34fd28a6e4fabee,37.69947277766893,-121.1391060770639,US-RGF,Stanislaus Forage Farm,0,"c(-100102.043437352, -34624.8600835721)" +62b121132228b0f8,39.59613673857021,-122.02261183095081,US-RGG,Glenn County Rice Farm,0,"c(-173815.733795438, 177251.993701875)" +7a561525ca4933d3,39.678693479807265,-122.00275036599245,US-RGO,Glenn County Organic Rice,1.48947665615348,"c(-171665.812216293, 186775.251166129)" +528fcb9581143cf5,38.167462870628164,-121.50580831669866,US-Si1,Staten Island Fallow,0,"c(-131645.501113361, 18645.981438884)" +38f1738eb5750e06,38.16651300978058,-121.52120421290903,US-Si2,Staten Island Flooded,0,"c(-132693.326336371, 18784.9819894633)" +5628e37756df9d41,38.10355405637426,-121.6381351773307,US-Tw2,Twitchell Corn,0,"c(-143324.768603639, 10189.6513305134)" +66de78917c6222b0,38.1157201973455,-121.64507432545884,US-Tw3,Twitchell Alfalfa,0,"c(-144200.168244178, 12239.4602416055)" +0fb8904477a3d0ce,38.10812386794265,-121.65528241872406,US-Twt,Twitchell Island,0,"c(-144755.572864808, 11526.4416088737)" +93d63eb396c4fcf9,38.54295166338029,-121.87235457852974,russell-ranch,Russell Ranch,0.20145926374138,"c(-163142.459970614, 60154.8849130729)" +106ced48ab2599e0,36.34077318361296,-120.10544017924985,west-side,West Side Research Center,55.7382484621095,"c(-9681.32377587362, -186108.326980472)" +04681ffea182d977,38.50242041442077,-121.97641993262302,wolfskill,Wolfskill Experimental Orchards,0,"c(-172200.763852519, 55891.621553943)" +9c562bb603f3b156,33.96726803332844,-117.34048733581272,ucr-citrus,UC Riverside Citrus Research,0,"c(245813.938552732, 
-446378.780685794)" diff --git a/data/design_points.csv b/data/design_points.csv new file mode 100644 index 0000000..7964aba --- /dev/null +++ b/data/design_points.csv @@ -0,0 +1,101 @@ +id,lat,lon +84389254458b3a58,39.15317154882152,-120.95862855945161 +9e3b1749179a4a68,37.99925059252099,-121.20417111702339 +729a6ef989c5a277,35.96903914573576,-118.98552352403604 +8c23aa17d5b211ed,37.23624485456503,-120.325416895509 +57ea98e608333b76,36.9069695937337,-121.68079166613182 +6c1e7a1eb4c36ea5,36.376933120111964,-119.45050526430943 +5b18c5e0bebe449b,34.472437079253254,-119.22014978387774 +058d96dea56318e8,39.845445833262076,-121.99765987838188 +df703141000c59c9,33.90118525984266,-117.4062351725988 +b8b2dd94a7c44662,33.353055572072144,-117.19182032970713 +dbb2c95a5cb30b5d,35.05617905596573,-119.27349643104922 +32b66a2605ee700d,36.62790357625534,-119.55109778115332 +4c3498f22c47aa3f,36.473246624392274,-119.58999372486372 +6def26440f10476c,36.55241300729422,-119.45881344136238 +59d94ec22335aba5,40.112945252187885,-122.12625218266525 +04268ba7e72d7bfb,36.04216376334651,-119.36168176985615 +f0c5457e0db29788,36.18711363649504,-118.99794513559745 +69151056c07813d8,36.01650482108036,-119.12213181961096 +096c9eebd28d0280,34.91295471748269,-120.40345170635351 +ebb783e86d2ac6fb,33.5784705873275,-116.03157176577903 +cf0be21778f4dfa5,36.64072986620738,-119.42861772753282 +b68880e61d448b93,36.628935713055334,-119.6304967981404 +1cb62b1edd188368,37.62449120008097,-120.54849914677672 +92e5e100e1a3a472,39.685053412615225,-122.3378033660457 +148e91e1a4352adb,39.883929087235735,-122.17230280623504 +e664ce5ac2fb2b6f,39.91336805758695,-122.11623004432802 +b33f7db23b64a377,37.95408699615648,-121.21347717890643 +268a52869b01fe54,36.703002551108156,-120.23482871624192 +bd841534f8883a00,36.79087105078594,-120.42721632875711 +60f846ca525da31e,36.557282635126995,-119.2223647492762 +e5d633da3758c491,38.698241380798265,-122.82083682384803 +ce81d5b03685a33e,36.48936289567316,-119.44392501284031 +ae81f35c7ed7c3f1,37.58450083241338,-121.28798414625811 +01609970faa3c5b9,38.247486225652,-122.1132785251325 +08da82da7c956979,38.10501623512874,-121.2107110261319 +d1dd970659a71943,34.37258472945189,-119.03318351437721 +39b7dba5debef188,36.643806231195825,-119.28748125011076 +8b27eb5fc60d506c,36.25395494422166,-119.26318761718049 +29baab2f12e2fd80,36.62869595451381,-119.55927338546941 +7f1efab0660af0a9,38.29195260046744,-121.54984763089621 +9c19b1bcbd391e00,36.665696469841485,-120.07237592351424 +462a123c3840df4e,36.88372637787987,-119.96643408398732 +2bd084fb5a916420,37.52907547714805,-121.1363874822378 +a05ea26a797f4bad,36.74201351767665,-119.66029084888854 +6fe9d9756f962d11,38.999929717176904,-122.85197002568268 +8c37c59148a2b480,36.950873550214546,-121.79638001291717 +cf9c2d9309d78058,35.656293805156665,-119.9791657782526 +765ad4a2cb597b04,37.61727674898043,-120.85985117950787 +d69343ed38ea49c7,35.93301364948056,-119.0188799294081 +c8ca707e38904560,37.430698407931075,-120.825633670933 +984a04e7d14a80a3,37.378490673407306,-120.70411638617661 +cf1c223d4e408116,37.07745477447745,-120.44392158740519 +11aa82e47c15b8df,34.38595949634103,-118.81445533388944 +f5cde95be18bb83d,39.61336729923421,-121.84504941875572 +2c9f5faf24bff9fd,36.139246198074844,-119.10524906004886 +30d7e9a739591579,38.694890771466596,-122.04115169522713 +889d6cb7440babd8,39.14835505068194,-121.78504578064799 +7afa2a6c097c1f5b,36.362595802193916,-119.10984333853257 +7b2e85c30ecfcbe8,36.036952558710894,-118.96561932450625 
+ecc585936d3203a5,36.63049097515202,-119.58155869596567 +a860cdc63cfc7cd1,35.98506301531852,-119.01458795848025 +17b056351d1484f8,36.738759687243714,-120.03573971347572 +e968e9c8f8574cb2,36.383833240819285,-119.65554268083451 +000f186bcf4b2be5,37.29108282118807,-121.04882363442482 +54d2ebadd8e4e8e2,37.491197984797665,-121.0085531562697 +6616b64433b71c3c,35.772843204255615,-119.09087891682833 +1114b0b499d4854c,36.51520247850828,-119.42062406057501 +4032fbd64a0b5147,36.76356458613677,-119.50957323217338 +6328b22167f31aad,33.86884047977172,-117.40837842690881 +9a28cf783448dc03,34.29707712580956,-119.06013978221256 +25d92d00fe5f31ea,35.99612889983196,-119.13104338547221 +254f091dee798a22,39.867933961656206,-122.16830730681002 +f87888e9df3fab14,39.70189739031584,-121.79205467074456 +05ce4008f973ff67,36.49534520970872,-119.5917993311476 +132df615713a9102,36.96032567057031,-121.37833422008488 +64cf20b7075f16bd,36.28704555525085,-119.12755106080951 +8c469dfef3eac688,39.73425797216054,-122.13900242983709 +d4aa2f4b173e52d9,39.894655731543544,-122.218282577988 +7a8855e8f27579dd,36.82200408868894,-120.01066959023188 +02ed5f046523985e,37.66867911623095,-121.71245595264062 +3bde8bc179b8fcae,36.90691122790127,-119.67871974793037 +5e21d98f69f14285,35.49646210616489,-119.7232326291538 +5aabe8a4b61ac4b1,37.70219947002282,-120.98628447816145 +ba5921d796f688e5,39.54046659749015,-122.22839029392931 +bf27f77fb0ee40e0,39.14372601708733,-121.57068428157207 +4c49ef413147e623,38.50692937324865,-121.97470253705194 +427606829eff9a98,36.60051373911106,-119.25963820359836 +24766015396f3dde,36.63384726465495,-119.49151662644384 +95af7876982dad47,39.667825249028695,-122.25810998713746 +e3daa31507eccfd0,39.16472984057072,-121.76550564744215 +b609ae97085420fb,37.87291873963135,-121.30336862647805 +1e113c95a1054df5,38.01749742979522,-121.19637437982693 +d6ce58a349148d32,36.20587304779096,-120.25336201321787 +2e11530b93cd492f,36.177799888824005,-120.2041070387766 +3747151a269036b7,36.94728213363369,-120.10533383829694 +2e11530b93cd492f,36.177799888824005,-120.2041070387766 +e07417e56c78efbc,36.82778534752612,-120.13964104692187 +e07417e56c78efbc,36.82778534752612,-120.13964104692187 +bfee4301f7c99060,36.35733308587667,-119.09282519975774 +9c562bb603f3b156,33.96726803332844,-117.34048733581272 diff --git a/data_raw/anchor_sites.csv b/data_raw/anchor_sites.csv new file mode 100644 index 0000000..1caf342 --- /dev/null +++ b/data_raw/anchor_sites.csv @@ -0,0 +1,26 @@ +external_site_id,site_name,lat,lon,crops,pft +US-ASH,San Joaquin Valley Almond High Salinity,36.1697,-120.201,Almond,woody perennial crop +US-ASL,San Joaquin Valley Almond Low Salinity,36.9466,-120.1024,Almond,woody perennial crop +US-ASM,San Joaquin Valley Almond Medium Salinity,36.1777,-120.2026,Almond,woody perennial crop +US-PSH,San Joaquin Valley Pistachio High Salinity,36.2347,-119.9247,Pistachio,woody perennial crop +US-PSL,San Joaquin Valley Pistachio Low Salinity,36.8276,-120.1397,Pistachio,woody perennial crop +US-Bi1,Bouldin Island Alfalfa,38.0992,-121.4993,Alfalfa,hay and haylage crops +US-Bi2,Bouldin Island Corn,38.1091,-121.5351,Corn,row crops +US-Dea,UC Desert REC Alfalfa,32.8136,-115.4423,Alfalfa,hay and haylage crops +US-DS1,Staten Corn 1,38.1335,-121.539,Corn,row crops +US-DS2,Staten Corn 2,38.1375,-121.514,Corn,row crops +US-DS3,Staten Rice 1,38.1235,-121.549,Rice,row crops +US-Lin,Lindcove Orchard,36.3566,-119.0922,Oranges,woody perennial crop +US-RGB,Butte County Rice Farm,39.5782,-121.8579,Rice,row crops +US-RGF,Stanislaus Forage 
Farm,37.6995,-121.1369,Forage crops,hay and haylage crops
+US-RGG,Glenn County Rice Farm,39.5944,-122.0253,Rice,row crops
+US-RGO,Glenn County Organic Rice,39.6805,-122.0026,Organic rice,row crops
+US-Si1,Staten Island Fallow,38.1747,-121.5047,Fallow,NA
+US-Si2,Staten Island Flooded,38.1758,-121.5167,Rice,row crops
+US-Tw2,Twitchell Corn,38.0969,-121.6365,Corn,row crops
+US-Tw3,Twitchell Alfalfa,38.1152,-121.6469,Alfalfa,herbaceous perennial
+US-Twt,Twitchell Island,38.1087,-121.6531,NA,NA
+russell-ranch,Russell Ranch,38.543,-121.874,"tomato, corn, wheat",row crops
+west-side,West Side Research Center,36.342,-120.108,Row crops including cotton,row crops
+wolfskill,Wolfskill Experimental Orchards,38.503,-121.977,Fruit and nut breeding,woody perennial crop
+ucr-citrus,UC Riverside Citrus Research,33.967,-117.34,Citrus variety collection,woody perennial crop
diff --git a/downscale/.future.R b/downscale/.future.R
new file mode 100644
index 0000000..de374d2
--- /dev/null
+++ b/downscale/.future.R
@@ -0,0 +1,8 @@
+# This file is loaded any time the future package is loaded.
+# see ?future::plan for more details
+
+# Auto-detect cores and leave one free
+no_cores <- max(future::availableCores() - 1, 1)
+future::plan(future::multicore, workers = no_cores)
+
+PEcAn.logger::logger.info(paste("Using", no_cores, "cores for parallel processing"))
\ No newline at end of file
diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R
new file mode 100644
index 0000000..5afb7f4
--- /dev/null
+++ b/downscale/00-prepare.R
@@ -0,0 +1,386 @@
+#' ---
+#' title: "Workflow Setup and Data Preparation"
+#' author: "David LeBauer"
+#' ---
+#'
+#'
+#' # Overview
+#'
+#' - Prepare Inputs
+#'   - Harmonized LandIQ dataset of woody California cropland from 2016-2023
+#'   - SoilGrids soil properties (clay, organic carbon density)
+#'   - ERA5 climatology (mean annual temperature, mean annual precipitation) and Cal-Adapt climate regions
+#' - Use LandIQ to query covariates from SoilGrids, ERA5, and Cal-Adapt and create a table that includes crop type, soil properties, and climatology for each woody crop field
+#'
+#' ## TODO
+#'
+#' - Use consistent projection(s):
+#'   - California Albers EPSG:3310 for joins and spatial operations
+#'   - WGS84 EPSG:4326 for plotting, subsetting rasters?
+#' - Clean up domain code
+#' - Create a bunch of tables and join all at once at the end
+#' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt
+#' - decide if we need both Ameriflux shortnames and full names in anchor_sites.csv
+#'   (prob. yes, b/c both are helpful); if so, come up w/ better name than 'location'
+#' - make sure anchor_sites_ids.csv fields are defined in README
+library(tidyverse)
+library(sf)
+library(terra)
+library(caladaptr)
+
+# moved to 000-config.R?
+data_dir <- "/projectnb2/dietzelab/ccmmf/data"
+raw_data_dir <- "/projectnb2/dietzelab/ccmmf/data_raw"
+ca_albers_crs <- 3310
+
+PEcAn.logger::logger.info("Starting Data Preparation Workflow")
+
+# source(here::here("downscale", "000-config.R"), local = TRUE)
+#' ### LandIQ Woody Polygons
+#'
+#' The first step is to convert LandIQ to an open, standard (TBD) format.
+#'
+#' We will use a GeoPackage file to store geospatial information and
+#' associated CSV files to store attributes associated with the LandIQ fields.
+#'
+#' The `site_id` field is the unique identifier for each field.
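+#'
+#' As a minimal sanity check (not run as part of the workflow), the
+#' GeoPackage + CSV contract can be verified by confirming that `site_id`
+#' is a unique key shared by both files:
+if (FALSE) {
+  fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg"), quiet = TRUE)
+  attrs <- readr::read_csv(file.path(data_dir, "ca_field_attributes.csv"))
+  stopifnot(
+    !any(duplicated(fields$site_id)), # geometry file has one row per field
+    all(attrs$site_id %in% fields$site_id) # every attribute row has a geometry
+  )
+}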
+#'
+#' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423.
+#' The function is a work in progress, and two key pieces of work remain.
+#' First, `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually.
+#' Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format.
+#'
+## Convert SHP to GeoPackage + CSV
+
+input_file <- file.path(raw_data_dir, 'i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp')
+ca_fields_gpkg <- file.path(data_dir, 'ca_fields.gpkg')
+ca_attributes_csv <- file.path(data_dir, 'ca_field_attributes.csv')
+# regenerate if either output is missing
+if (!file.exists(ca_fields_gpkg) || !file.exists(ca_attributes_csv)) {
+  landiq2std(input_file, ca_fields_gpkg, ca_attributes_csv) # if landiq2std isn't available, see software section of README
+  PEcAn.logger::logger.info(paste0("Created ca_fields.gpkg and ca_field_attributes.csv in ", data_dir))
+} else {
+  PEcAn.logger::logger.info("ca_fields.gpkg and ca_field_attributes.csv already exist in ", data_dir)
+}
+
+ca_fields <- sf::st_read(ca_fields_gpkg) |>
+  sf::st_transform(crs = ca_albers_crs)
+ca_attributes <- readr::read_csv(ca_attributes_csv)
+
+#' ##### Subset Woody Perennial Crop Fields
+#'
+#' Phase 1 focuses on Woody Perennial Crop fields.
+#'
+#' Next, we will subset the LandIQ data to only include woody perennial crop fields.
+#' At the same time we will calculate the total percent of California Croplands that are woody perennial crop.
+#'
+## -----------------------------------------------------------------------------
+
+ca_woody_gpkg <- file.path(data_dir, 'ca_woody.gpkg')
+if (!file.exists(ca_woody_gpkg)) {
+  ca_fields |>
+    # inner join keeps only fields classified as woody perennial crop
+    inner_join(
+      ca_attributes |>
+        filter(pft == "woody perennial crop") |>
+        select(site_id, pft),
+      by = c("site_id")
+    ) |>
+    sf::st_transform(crs = ca_albers_crs) |>
+    dplyr::select(site_id, geom) |>
+    sf::st_write(
+      ca_woody_gpkg,
+      delete_dsn = TRUE
+    )
+  PEcAn.logger::logger.info("Created ca_woody.gpkg with woody perennial crop fields in ", data_dir)
+} else {
+  PEcAn.logger::logger.info("ca_woody.gpkg already exists in ", data_dir)
+}
+
+#' ### Create California bounding box and polygon for clipping
+#'
+
+#' ### Convert Polygons to Points.
+#'
+#' For Phase 1, we will use points to query raster data.
+#' In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms.
+#'
+
+ca_fields_pts <- ca_fields |>
+  dplyr::select(-lat, -lon) |>
+  left_join(ca_attributes, by = "site_id") |>
+  sf::st_centroid() |>
+  # and keep only the columns we need
+  dplyr::select(site_id, crop, pft, geom)
+
+#'
+#' ## Environmental Covariates
+#'
+#' ### SoilGrids
+#'
+#' #### Load Prepared Soilgrids GeoTIFF
+#'
+#' Using already prepared SoilGrids layers.
+#' TODO: move a copy of these files to data_dir +#' +## ----load-soilgrids----------------------------------------------------------- +soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' +soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' +## if we want to clip to CA +## use terra to read in that file and then extract values for each location + +soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) +soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) + +#' +#' #### Extract clay and carbon stock from SoilGrids +#' +## ----sg-clay-ocd-------------------------------------------------------------- + +clay <- terra::extract( + soilgrids_north_america_clay_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(soilgrids_north_america_clay_rast)))) |> + dplyr::select(-ID) |> + dplyr::pull() / 10 + +ocd <- terra::extract( + soilgrids_north_america_ocd_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(soilgrids_north_america_ocd_rast)))) |> + dplyr::select(-ID) |> + dplyr::pull() + +ca_fields_pts_clay_ocd <- cbind(ca_fields_pts, + clay = clay, + ocd = ocd) + +#' +#' ### Topographic Wetness Index +#' +## ----twi---------------------------------------------------------------------- +twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' +twi_rast <- terra::rast(twi_tiff) + +twi <- terra::extract( + twi_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(twi_rast)))) |> + dplyr::select(-ID) |> + dplyr::pull() + +ca_fields_pts_clay_ocd_twi <- cbind(ca_fields_pts_clay_ocd, twi = twi) + +#' +#' ### ERA5 Met Data +#' +## ----------------------------------------------------------------------------- +era5met_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" + +# List all ERA5_met_*.tiff files for years 2012-2021 +raster_files <- list.files( + path = era5met_dir, + pattern = "^ERA5_met_\\d{4}\\.tiff$", + full.names = TRUE +) + +# Read all rasters into a list of SpatRaster objects +rasters_list <- purrr::map( + raster_files, + ~ terra::rast(.x)) + +years <- purrr::map_chr(rasters_list, ~ { + source_path <- terra::sources(.x)[1] + stringr::str_extract(source_path, "\\d{4}") +}) |> + as.integer() + +names(rasters_list) <- years + +extract_clim <- function(raster, points_sf) { + terra::extract( + raster, + points_sf |> + sf::st_transform(crs = sf::st_crs(raster)) + ) |> + tibble::as_tibble() |> + select(-ID) |> + mutate(site_id = points_sf$site_id) |> + select(site_id, temp, prec, srad, vapr) +} + +.tmp <- rasters_list |> + furrr::future_map_dfr( + ~ extract_clim(.x, ca_fields_pts), + .id = "year" + ) + +clim_summaries <- .tmp |> + dplyr::mutate( + precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") +) |> + dplyr::group_by(site_id) |> + dplyr::summarise( + temp = mean(temp), + precip = mean(precip), + srad = mean(srad), + vapr = mean(vapr) + ) + +#' +## ----join_and_subset---------------------------------------------------------- +.all <- clim_summaries |> + dplyr::left_join(ca_fields_pts_clay_ocd_twi, by = "site_id") + +assertthat::assert_that( + nrow(.all) == nrow(clim_summaries) && + nrow(.all) == nrow(ca_fields_pts_clay_ocd_twi), + msg = "join was not 1:1 as expected" +) + +#' Append CA Climate Region +#' +## Add Climregions +# 
load climate regions for mapping +#' ### Cal-Adapt Climate Regions +#' +#' Climate Region will be used as a factor +#' in the hierarchical clustering step. +## ----caladapt_climregions----------------------------------------------------- + +ca_field_climregions <- ca_fields |> + sf::st_join( + caladaptr::ca_aoipreset_geom("climregions") |> + sf::st_transform(crs = ca_albers_crs), + join = sf::st_within + ) |> + dplyr::select( + site_id, + climregion_id = id, + climregion_name = name + ) + +# This returns a point geometry. +# To return the **polygon** geometry from ca_fields, +# drop geometry from .all instead of from ca_field_climregions +.all2 <- .all |> + dplyr::left_join( + ca_field_climregions |> st_drop_geometry(), + by = "site_id" + ) + +site_covariates <- .all2 |> + na.omit() |> + mutate(across(where(is.numeric), ~ signif(., digits = 3))) + +PEcAn.logger::logger.info( + round(100 * (1 - nrow(site_covariates) / nrow(ca_fields)), 0), "% of LandIQ polygons (sites) have at least one missing environmental covariate" +) + +# takes a long time +# knitr::kable(skimr::skim(site_covariates)) + +readr::write_csv(site_covariates, file.path(data_dir, "site_covariates.csv")) + +#' +#' ## Anchor Sites +#' +## ----anchor-sites------------------------------------------------------------- +# Anchor sites from UC Davis, UC Riverside, and Ameriflux. +# TODO rename raw_data/anchor_sites --> anchor_locations.csv +anchor_sites <- readr::read_csv("data_raw/anchor_sites.csv") +anchor_sites_pts <- anchor_sites |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> + sf::st_transform(crs = ca_albers_crs) + +# Join with ca_fields: keep only the rows associated with anchor sites +# spatial join find ca_fields that contain anchor site points +# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" + +# Note: in the next step, pfts are removed from the anchorsites dataframe +# and kept pfts from site_covariates +# the ones in the sites_covariates were generated by the landiq2std function +# TODO we will need to make sure that pfts are consistent b/w landiq and anchorsites +# by identifying and investigating discrepancies + +# First subset ca_fields to only include those with covariates + +ca_fields_with_covariates <- ca_fields |> + dplyr::right_join(site_covariates |> select(site_id), by = "site_id") + +anchor_sites_with_ids <- anchor_sites_pts |> + sf::st_join(ca_fields_with_covariates, + join = sf::st_within + ) + +# Handle unmatched anchor sites +unmatched_anchor_sites <- anchor_sites_with_ids |> + dplyr::filter(is.na(site_id)) +matched_anchor_sites <- anchor_sites_with_ids |> + dplyr::filter(!is.na(site_id)) + +if (nrow(unmatched_anchor_sites) > 0) { + # TODO Consider if it is more efficient and clear to match all anchor sites using + # st_nearest_feature rather than st_within + nearest_indices <- sf::st_nearest_feature(unmatched_anchor_sites, ca_fields) + + # Get nearest ca_fields + nearest_ca_fields <- ca_fields |> dplyr::slice(nearest_indices) + + # Assign site_id and calculate distances + unmatched_anchor_sites <- unmatched_anchor_sites |> + dplyr::mutate( + site_id = nearest_ca_fields$site_id, + lat = nearest_ca_fields$lat, + lon = nearest_ca_fields$lon, + distance_m = sf::st_distance(geometry, nearest_ca_fields, by_element = TRUE) + ) + threshold <- units::set_units(250, "m") + if (any(unmatched_anchor_sites$distance_m > threshold)) { + PEcAn.logger::logger.warn( + "The following anchor sites are more than 250 m away from the nearest landiq field:", + paste(unmatched_anchor_sites 
|> filter(distance_m > threshold) |> pull(site_name), collapse = ", "),
+      "Please check the distance_m column in the unmatched_anchor_sites data.",
+      "Consider dropping these sites or expanding the threshold."
+    )
+  }
+
+  # Combine matched and unmatched anchor sites
+  anchor_sites_with_ids <- dplyr::bind_rows(
+    matched_anchor_sites,
+    unmatched_anchor_sites |> select(-distance_m)
+  )
+}
+
+# Check for missing site_id, lat, or lon
+if (any(is.na(anchor_sites_with_ids |> select(site_id, lat, lon)))) {
+  PEcAn.logger::logger.warn(
+    "Some anchor sites **still** have missing site_id, lat, or lon!"
+  )
+}
+
+# Check for anchor sites with any covariate missing
+n_missing <- anchor_sites_with_ids |>
+  left_join(site_covariates, by = "site_id") |>
+  dplyr::select(
+    site_id, lat, lon,
+    clay, ocd, twi, temp, precip
+  ) |>
+  filter(if_any(
+    everything(),
+    ~ is.na(.x)
+  )) |>
+  nrow()
+
+if (n_missing > 0) {
+  PEcAn.logger::logger.warn(
+    "Some anchor sites have missing environmental covariates!"
+  )
+}
+
+
+# Save processed anchor sites
+anchor_sites_with_ids |>
+  sf::st_drop_geometry() |>
+  dplyr::select(site_id, lat, lon, external_site_id, site_name, crops, pft) |>
+  # save lat, lon with 5 decimal places
+  dplyr::mutate(
+    lat = round(lat, 5),
+    lon = round(lon, 5)
+  ) |>
+  readr::write_csv(file.path(data_dir, "anchor_sites.csv"))
+
diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R
new file mode 100644
index 0000000..8ca3b97
--- /dev/null
+++ b/downscale/01_cluster_and_select_design_points.R
@@ -0,0 +1,412 @@
+#' ---
+#' title: "Cluster and Select Design Points"
+#' author: "David LeBauer"
+#' ---
+#'
+#' # Overview
+#'
+#' This workflow will:
+#'
+#' - Read in a dataset of site environmental data
+#' - Perform K-means clustering to identify clusters
+#' - Select design points for each cluster
+#' - Create design_points.csv and anchor_sites.csv
+#'
+#'
+#' ## Setup
+#'
+## ----setup--------------------------------------------------------------------
+# general utilities
+library(tidyverse)
+# spatial
+library(sf)
+library(terra)
+library(caladaptr) # to plot climate regions
+# parallel computing
+library(furrr)
+library(doParallel)
+library(dplyr)
+# analysis
+library(cluster)
+library(factoextra)
+library(pathviewr) # for find_curve_elbow()
+
+# plotting
+library(GGally)
+
+# settings
+set.seed(42)
+ca_albers_crs <- 3310 # use California Albers projection (EPSG:3310) for speed
+data_dir <- "/projectnb/dietzelab/ccmmf/data"
+#'
+#' ## Load Site Environmental Data Covariates
+#'
+#' Environmental data was pre-processed in the previous workflow 00-prepare.R.
+#'
+#' Below is a summary of the covariates dataset:
+#'
+#' - site_id: Unique identifier for each polygon
+#' - temp: Mean Annual Temperature from ERA5
+#' - precip: Mean Annual Precipitation from ERA5
+#' - srad: Solar Radiation
+#' - vapr: Vapor pressure
+#' - clay: Clay content from SoilGrids
+#' - ocd: Organic carbon density from SoilGrids
+#' - twi: Topographic Wetness Index
+
+site_covariates_csv <- file.path(data_dir, "site_covariates.csv")
+site_covariates <- readr::read_csv(site_covariates_csv)
+
+#' ## Anchor Site Selection
+#'
+#' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux.
+#'
+## ----anchor-sites-selection---------------------------------------------------
+
+anchor_sites_with_ids <- readr::read_csv(file.path(data_dir, "anchor_sites_ids.csv"))
+
+anchor_sites_pts <- anchor_sites_with_ids |>
+  sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
+  sf::st_transform(crs = ca_albers_crs)
+
+# create map of anchor sites
+ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |>
+  rename(climregion_name = name, climregion_id = id)
+p <- anchor_sites_pts |>
+  ggplot() +
+  geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.25) +
+  labs(fill = "Climate Region") +
+  geom_sf(aes(color = pft)) +
+  scale_color_brewer(palette = "Dark2") +
+  labs(color = "PFT") +
+  theme_minimal()
+ggsave(p, filename = "downscale/figures/anchor_sites.png", dpi = 300, bg = "white")
+
+anchorsites_for_clust <-
+  anchor_sites_with_ids |>
+  select(-pft) |>
+  left_join(site_covariates, by = 'site_id')
+
+
+#'
+#' ### Subset LandIQ fields for clustering
+#'
+#' The following code does:
+#'
+#' - Read in a dataset of site environmental data
+#' - Remove anchor sites from the dataset that will be used for clustering
+#' - Subsample the dataset - 136GB RAM is insufficient to cluster 100k rows
+#' - Bind anchor sites back to the dataset
+#'
+## ----subset-for-clustering-----------------------------------------------------
+# 10k works
+# 2k sufficient for testing
+sample_size <- 10000
+
+data_for_clust <- site_covariates |>
+  # remove anchor sites
+  dplyr::filter(!site_id %in% anchorsites_for_clust$site_id) |>
+  # subset to woody perennial crops
+  # dplyr::filter(pft == "woody perennial crop") |>
+  # dplyr::mutate(pft = ifelse(pft == "woody perennial crop", "woody perennial crop", "other")) |>
+  sample_n(sample_size - nrow(anchorsites_for_clust)) |>
+  # now add anchor sites back
+  bind_rows(anchorsites_for_clust) |>
+  dplyr::mutate(
+    crop = factor(crop),
+    climregion_name = factor(climregion_name)
+  ) |>
+  select(-lat, -lon)
+assertthat::assert_that(nrow(data_for_clust) == sample_size)
+
+PEcAn.logger::logger.info("Summary of data for clustering before scaling:")
+skimr::skim(data_for_clust)
+
+#'
+#' ### K-means Clustering
+#'
+#' K-means on the numeric columns of covariates.
+#' Full list printed in message; currently (temp, precip, srad, vapr, clay, ocd, twi).
+#' We could also include 'crop' as a categorical variable with one-hot encoding, i.e.
+#' crop1 crop2 crop3 +#' 0 1 0 +#' 1 0 0 +#' by changing function below w/ +# perform_clustering <- function(data, k_range = 2:20) { + # Perform one-hot encoding for the 'crop' variable + # encoded_crop <- model.matrix(~ crop - 1, data = data) + # clust_data <- data |> + # select(where(is.numeric), -ends_with("id")) |> + # cbind(encoded_crop) + # extract metrics + + +perform_clustering <- function(data, k_range = 2:20) { + # Select numeric variables for clustering + clust_data <- data |> + select(where(is.numeric), -ends_with("id")) + + PEcAn.logger::logger.info( + "Columns used for clustering: ", + paste(names(clust_data), collapse = ", ") + ) + # Standardize data + clust_data_scaled <- scale(clust_data) + gc() # free up memory + PEcAn.logger::logger.info("Summary of scaled data used for clustering:") + print(skimr::skim(clust_data_scaled)) + + # Determine optimal number of clusters using elbow method + metrics_list <- furrr::future_map( + k_range, + function(k) { + model <- hkmeans(clust_data_scaled, k) + total_withinss <- model$tot.withinss + sil_score <- mean(silhouette(model$cluster, dist(clust_data_scaled))[, 3]) + # dunn_index <- + # calinski_harabasz <- + list(model = model, total_withinss = total_withinss, sil_score = sil_score) + }, + .options = furrr_options(seed = TRUE) + ) + + + + metrics_df <- data.frame( + # see also https://github.com/PecanProject/pecan/blob/b5322a0fc62760b4981b2565aabafc07b848a699/modules/assim.sequential/inst/sda_backup/bmorrison/site_selection/pick_sda_sites.R#L221 + k = k_range, + tot.withinss = map_dbl(metrics_list, "total_withinss"), + sil_score = map_dbl(metrics_list, "sil_score") +# dunn_index = map_dbl(metrics_list, "dunn_index") +# calinski_harabasz = map_dbl(metrics_list, "calinski_harabasz") + ) + + elbow_k <- find_curve_elbow( + metrics_df[, c("k", "tot.withinss")], + export_type = "k" # default uses row number instead of k + )["k"] + +## TODO check other metrics (b/c sil and elbow disagree) +# other metrics +# sil_k <- metrics_df$k[which.max(metrics_df$sil_score)] +# dunn_k <- metrics_df$k[which.max(metrics_df$dunn_index)] +# calinski_harabasz_k <- metrics_df$k[which.max(metrics_df$calinski_harabasz)] + + txtplot::txtplot( + x = metrics_df$k, y = metrics_df$tot.withinss, + xlab = "k (number of clusters)", + ylab = "SS(Within)" + ) + PEcAn.logger::logger.info( + "Optimal number of clusters according to Elbow Method: ", elbow_k, + "(where the k vs ss(within) curve starts to flatten.)" + ) + + PEcAn.logger::logger.info("Silhouette scores computed. Higher values indicate better-defined clusters.") + txtplot::txtplot( + x = metrics_df$k, y = metrics_df$sil_score, + xlab = "Number of Clusters (k)", ylab = "Score" + ) + + # Perform hierarchical k-means clustering with optimal k + final_hkmeans <- hkmeans(clust_data_scaled, elbow_k) + clust_data <- cbind( + site_id = data$site_id, + clust_data, + cluster = final_hkmeans$cluster + ) + + return(clust_data) +} + +#' +#' Apply clustering function to the sampled dataset. 
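+#'
+#' Before doing so, as a minimal illustration (not run) of the pieces used
+#' inside `perform_clustering()` - `factoextra::hkmeans()` and
+#' `cluster::silhouette()` - applied to a small toy matrix:
+if (FALSE) {
+  toy <- scale(iris[, 1:4]) # toy numeric matrix, standardized as above
+  toy_fit <- factoextra::hkmeans(toy, k = 3)
+  toy_fit$tot.withinss # total within-cluster sum of squares (elbow metric)
+  mean(cluster::silhouette(toy_fit$cluster, dist(toy))[, 3]) # mean silhouette width
+}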
+#'
+## ----clustering-----------------------------------------------------------------
+
+PEcAn.logger::logger.info("Starting clustering")
+sites_clustered <- perform_clustering(data_for_clust, k_range = 5:15)
+
+#'
+#' ### Check Clustering
+#'
+## ----check-clustering-----------------------------------------------------------
+# Summarize clusters
+cluster_summary <- sites_clustered |>
+  group_by(cluster) |>
+  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
+
+knitr::kable(cluster_summary, digits = 0)
+
+# Plot all pairwise numeric variables
+ggpairs_plot <- sites_clustered |>
+  select(-site_id) |>
+  # ggpairs needs a small number of rows to stay legible and fast
+  sample_n(1000) |>
+  ggpairs(
+    # plot all values except site_id and cluster
+    columns = setdiff(names(sites_clustered), c("site_id", "cluster")),
+    mapping = aes(color = as.factor(cluster), alpha = 0.8)
+  ) +
+  theme_minimal()
+ggsave(ggpairs_plot,
+  filename = "downscale/figures/cluster_pairs.png",
+  dpi = 300, width = 10, height = 10, units = "in"
+)
+
+# scale and reshape to long for plotting
+
+# Normalize the cluster summary data
+scaled_cluster_summary <- cluster_summary |>
+  mutate(across(-cluster, scale)) |>
+  pivot_longer(
+    cols = -cluster,
+    names_to = "variable",
+    values_to = "value"
+  )
+
+cluster_plot <- ggplot(
+  scaled_cluster_summary,
+  aes(x = factor(variable), y = value)
+) +
+  geom_bar(stat = "identity", position = "dodge") +
+  facet_wrap(~cluster) +
+  coord_flip() +
+  labs(x = "Variable", y = "Normalized Value") +
+  theme_minimal()
+
+ggsave(cluster_plot, filename = "downscale/figures/cluster_plot.png", dpi = 300)
+
+#'
+#' #### Stratification by Crops and Climate Regions
+#'
+## ----check-stratification-------------------------------------------------------
+# Check stratification of clusters by categorical factors
+
+# cols should be character, factor
+crop_ids <- read_csv(
+  file.path(data_dir, "crop_ids.csv"),
+  col_types = cols(
+    crop_id = col_factor(),
+    crop = col_character()
+  )
+)
+
+climregion_ids <- read_csv(
+  file.path(data_dir, "climregion_ids.csv"),
+  col_types = cols(
+    climregion_id = col_factor(),
+    climregion_name = col_character()
+  )
+)
+
+## ----stratification-------------------------------------------------------------
+# The goal here is to check the stratification of the clusters by crop and climregion
+# to ensure that the clusters are not dominated by a single crop or climregion.
+# BUT this is probably only useful if normalized by the total number of fields
+# in each cluster.
+
+# ca_attributes <- read_csv(file.path(data_dir, "ca_field_attributes.csv"))
+# site_covariates <- read_csv(file.path(data_dir, "site_covariates.csv"))
+
+# sites_clustered <- sites_clustered |>
+#   left_join(ca_attributes, by = "site_id") |>
+#   left_join(site_covariates, by = "site_id")
+
+# factor_stratification <- list(
+#   crop_id = table(sites_clustered$cluster, sites_clustered$crop),
+#   climregion_id = table(sites_clustered$cluster, sites_clustered$climregion_name))
+
+# normalize <- function(x) {
+#   round(100 * x / rowSums(x), 1)
+# }
+# normalized_stratification <- lapply(factor_stratification, normalize)
+# lapply(normalized_stratification, knitr::kable)
+
+#'
+#' ## Design Point Selection
+#'
+#' For phase 1b we need to supply design points for SIPNET runs.
+#' For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites.
+#'
+#' For the final high resolution runs we expect to use approximately 10,000 design points.
+#' For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts.
+#'
+
+#'
+#' ### How Many Design Points?
+#'
+#' Calculating Woody Cropland Proportion
+#'
+#' Here we calculate the percent of California croplands that are woody perennial crops,
+#' in order to estimate the number of design points that will be selected in the clustering step.
+#' This is commented out because it takes a while and the number won't change.
+## ----woody-proportion-----------------------------------------------------------
+ca_attributes <- read_csv(file.path(data_dir, "ca_field_attributes.csv"))
+ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg"))
+# pft_area <- ca_fields |>
+#   left_join(ca_attributes, by = "site_id") |>
+#   dplyr::select(site_id, pft, area_ha) |>
+#   dtplyr::lazy_dt() |>
+#   dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |>
+#   dplyr::group_by(woody_indicator) |>
+#   dplyr::summarize(pft_area = sum(area_ha)) |>
+#   # calculate percent of total area
+#   dplyr::mutate(pft_area_pct = pft_area / sum(pft_area) * 100)
+
+# knitr::kable(pft_area, digits = 0)
+# answer: 17% of California croplands were woody perennial crops in the
+# 2016 LandIQ dataset
+# So ... if we want to ultimately have 2000 design points, we should have ~ 400
+# design points for woody perennial crops
+
+
+## ----design-point-selection-----------------------------------------------------
+# From the clustered data, remove anchor sites to avoid duplicates in design point selection.
+
+design_points_ids <- sites_clustered |>
+  filter(!site_id %in% anchorsites_for_clust$site_id) |>
+  select(site_id) |>
+  sample_n(100 - nrow(anchorsites_for_clust))
+
+anchor_site_ids <- anchorsites_for_clust |>
+  select(site_id)
+
+design_points <- bind_rows(design_points_ids,
+  anchor_site_ids) |>
+  left_join(ca_fields, by = "site_id")
+
+design_points |>
+  as_tibble() |>
+  select(site_id, lat, lon) |>
+  write_csv(file.path(data_dir, "design_points.csv"))
+
+#'
+#' ### Design Point Map
+#'
+#' Now some analysis of how these design points are distributed
+#'
+## ----design-point-map-----------------------------------------------------------
+# plot map of california and climregions
+
+design_points_clust <- design_points |>
+  left_join(sites_clustered, by = "site_id") |>
+  select(site_id, lat, lon, cluster) |>
+  drop_na(lat, lon) |>
+  mutate(cluster = as.factor(cluster)) |>
+  st_as_sf(coords = c("lon", "lat"), crs = 4326)
+
+design_pt_plot <- ggplot() +
+  geom_sf(data = ca_climregions, fill = 'white') +
+  theme_minimal() +
+  geom_sf(data = ca_fields, fill = "lightgrey", color = "lightgrey", alpha = 0.25) +
+  geom_sf(data = design_points_clust) +
+  geom_text(data = design_points_clust, aes(label = cluster, geometry = geometry),
+    size = 2, stat = "sf_coordinates")
+
+ggsave(design_pt_plot, filename = "downscale/figures/design_points.png", dpi = 300, bg = "white")
diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R
new file mode 100644
index 0000000..176ccf6
--- /dev/null
+++ b/downscale/02_extract_sipnet_output.R
@@ -0,0 +1,317 @@
+# This file processes the output from SIPNET ensemble runs and generates
+# three different data formats that comply with the Ecological Forecasting
+# Initiative (EFI) standard:
+# 1. A 4-D array (time, site, ensemble, variable)
+# 2. A long format data frame (time, site, ensemble, variable)
+# 3.
A NetCDF file (time, site, ensemble, variable) +# This code can be moved to PEcAn.utils as one or more functions +# I did this so that I could determine which format is easiest to work with +# For now, I am planning to work with the CSV format +# TODO: write out EML metadata in order to be fully EFI compliant +library(PEcAn.logger) +library(lubridate) +library(dplyr) +library(ncdf4) +library(furrr) +library(stringr) + +# Define base directory for ensemble outputs +modeloutdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" + +# Read settings file and extract run information +settings <- PEcAn.settings::read.settings(file.path(modeloutdir, "settings.xml")) +outdir <- file.path(modeloutdir, settings$modeloutdir) +ensemble_size <- settings$ensemble$size |> + as.numeric() +start_date <- settings$run$settings.1$start.date +start_year <- lubridate::year(start_date) +end_date <- settings$run$settings.1$end.date +end_year <- lubridate::year(end_date) + +# Site Information +# design points for 1b +# data/design_points.csv +design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0e43ba4f851b7ba0d427d112be362/data/design_points.csv" +design_points <- readr::read_csv(design_pt_csv, show_col_types = FALSE) |> + rename(site_id = id) |> # TODO remove; has already been renamed in source file + dplyr::distinct() + +# Variables to extract +variables <- c("AGB", "TotSoilCarb") + +#' **Available Variables** +#' This list is from the YYYY.nc.var files, and may change, +#' e.g. if we write out less information in order to save time and storage space +#' See SIPNET parameters.md for more details +#' PEcAn standard units follow Climate Forecasting standards: +#' Carbon and water pools are in units of kg / m2 +#' Fluxes are in units of kg / m2 / s-1 +#' +#' | Variable | Description | +#' |-------------------------------|------------------------------------------| +#' | GPP | Gross Primary Productivity | +#' | NPP | Net Primary Productivity | +#' | TotalResp | Total Respiration | +#' | AutoResp | Autotrophic Respiration | +#' | HeteroResp | Heterotrophic Respiration | +#' | SoilResp | Soil Respiration | +#' | NEE | Net Ecosystem Exchange | +#' | AbvGrndWood | Above ground woody biomass | +#' | leaf_carbon_content | Leaf Carbon Content | +#' | TotLivBiom | Total living biomass | +#' | TotSoilCarb | Total Soil Carbon | +#' | Qle | Latent heat | +#' | Transp | Total transpiration | +#' | SoilMoist | Average Layer Soil Moisture | +#' | SoilMoistFrac | Average Layer Fraction of Saturation | +#' | SWE | Snow Water Equivalent | +#' | litter_carbon_content | Litter Carbon Content | +#' | litter_mass_content_of_water | Average layer litter moisture | +#' | LAI | Leaf Area Index | +#' | fine_root_carbon_content | Fine Root Carbon Content | +#' | coarse_root_carbon_content | Coarse Root Carbon Content | +#' | GWBI | Gross Woody Biomass Increment | +#' | AGB | Total aboveground biomass | +#' | time_bounds | history time interval endpoints | + +site_ids <- design_points |> + pull(site_id) |> + unique() +ens_ids <- 1:ensemble_size + +##-----TESTING SUBSET-----## +# comment out for full run # +# site_ids <- site_ids[1:5] +# ens_ids <- ens_ids[1:5] +# start_year <- end_year - 1 + +ens_dirs <- expand.grid(ens = PEcAn.utils::left.pad.zeros(ens_ids), + site_id = site_ids, + stringsAsFactors = FALSE) |> + mutate(dir = file.path(outdir, paste("ENS", ens, site_id, sep = "-"))) +# Check that all ens dirs exist +existing_dirs <- file.exists(ens_dirs$dir) +if (!all(existing_dirs)) { + missing_dirs 
<- ens_dirs$dir[!existing_dirs]
+  PEcAn.logger::logger.warn("Missing expected ensemble directories: ", paste(missing_dirs, collapse = ", "))
+}
+
+# extract output via PEcAn.utils::read.output
+# temporarily suppress logging or else it will print a lot of file names
+logger_level <- PEcAn.logger::logger.setLevel("OFF")
+ens_results <- furrr::future_pmap_dfr(
+  ens_dirs,
+  function(ens, site_id, dir) {
+    out_df <- PEcAn.utils::read.output(
+      runid = paste(ens, site_id, sep = "-"),
+      outdir = dir,
+      start.year = start_year,
+      end.year = end_year,
+      variables = variables,
+      dataframe = TRUE,
+      verbose = FALSE
+    ) |>
+      dplyr::mutate(site_id = .env$site_id, ensemble = as.numeric(.env$ens)) |>
+      dplyr::rename(time = posix)
+  },
+  # Avoids warning "future unexpectedly generated random numbers",
+  # which apparently originates from actions taken inside the `units` package
+  # when its namespace is loaded by ud_convert inside read.output.
+  # The warning is likely spurious, but looks scary and setting seed to
+  # silence it does not hurt anything.
+  # Associated bug report: https://github.com/r-quantities/units/issues/409
+  .options = furrr::furrr_options(seed = TRUE)
+) |>
+  group_by(ensemble, site_id, year) |>
+  # filter(year <= end_year) |> # not sure why this was necessary; should be taken care of by read.output
+  filter(time == max(time)) |> # only take last value
+  ungroup() |>
+  arrange(ensemble, site_id, year) |>
+  tidyr::pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "prediction")
+
+# After extraction, ens_results$prediction is in kg C m-2 for both AGB and TotSoilCarb
+
+# restore logging
+logger_level <- PEcAn.logger::logger.setLevel(logger_level)
+
+## Create Ensemble Output For Downscaling
+## Below, three different output formats are created:
+## 1. 4-D array (time, site, ensemble, variable)
+## 2. long format data frame (time, site, ensemble, variable)
+## 3. NetCDF file (time, site, ensemble, variable)
+
+# --- 1. Create 4-D array ---
+# Add a time dimension (even if of length 1) so that dimensions are: [time, site, ensemble, variable]
+unique_times <- sort(unique(ens_results$time))
+if (length(unique_times) != length(start_year:end_year)) {
+  # this check may fail if we are using > one time point per year,
+  # i.e. if the code above including group_by(.., year) is changed
+  PEcAn.logger::logger.warn(
+    "there should only be one unique time per year",
+    "unless we are doing a time series with multiple time points per year"
+  )
+}
+
+# Create a list to hold one 3-D array per variable
+ens_arrays <- list()
+for (var in variables) {
+  # Preallocate 3-D array for time, site, ensemble for each variable
+  arr <- array(NA,
+    dim = c(length(unique_times), length(site_ids), length(ens_ids)),
+    dimnames = list(
+      datetime = as.character(unique_times),
+      site_id = site_ids,
+      ensemble = as.character(ens_ids)
+    )
+  )
+
+  # Get rows corresponding to the current variable
+  subset_idx <- which(ens_results$variable == var)
+  if (length(subset_idx) > 0) {
+    i_time <- match(ens_results$time[subset_idx], unique_times)
+    i_site <- match(ens_results$site_id[subset_idx], site_ids)
+    i_ens <- match(ens_results$ensemble[subset_idx], ens_ids)
+    arr[cbind(i_time, i_site, i_ens)] <- ens_results$prediction[subset_idx]
+  }
+
+  ens_arrays[[var]] <- arr
+}
+
+# ens_arrays: each array is [time, site, ensemble], values in kg C m-2
+
+saveRDS(ens_arrays, file = file.path(outdir, "ensemble_output.rds")) # units: kg C m-2
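+
+# As a quick usage sketch (not run), the per-variable arrays can be sliced
+# directly by dimension, e.g. the ensemble mean SOC trajectory for one site:
+if (FALSE) {
+  soc <- ens_arrays[["TotSoilCarb"]] # [time, site, ensemble], kg C m-2
+  site1_mean <- apply(soc[, 1, , drop = FALSE], 1, mean, na.rm = TRUE)
+}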
+
+# --- 2. Create EFI Standard v1.0 long format data frame ---
+efi_long <- ens_results |>
+  rename(datetime = time) |>
+  select(datetime, site_id, ensemble, variable, prediction)
+# NOTE: prediction is in kg C m-2
+
+readr::write_csv(efi_long, file.path(outdir, "ensemble_output.csv"))
+# Consider: write a README or add a comment about units in the CSV
+
+####--- 3. Create EFI Standard v1.0 NetCDF files
+library(ncdf4)
+# Assume these objects already exist (created above):
+# unique_times: vector of unique datetime strings
+# design_points: data frame with columns lat, lon, and site_id
+# ens_ids: vector of ensemble member numbers (numeric)
+# ens_arrays: list with elements "AGB" and "TotSoilCarb" that are arrays
+#   with dimensions: datetime, site, ensemble
+
+# Get dimension names / site IDs
+time_char <- unique_times
+
+lat <- design_points |>
+  filter(site_id %in% site_ids) |> # only required when testing w/ subset
+  dplyr::pull(lat)
+lon <- design_points |>
+  filter(site_id %in% site_ids) |>
+  dplyr::pull(lon)
+
+# Convert time to CF-compliant values using PEcAn.utils::datetime2cf
+time_units <- "days since 1970-01-01 00:00:00"
+cf_time <- PEcAn.utils::datetime2cf(time_char, unit = time_units)
+
+# TODO: could accept start year as an argument to the to_ncdim function if variable = 'time'? Or set default?
+# Otherwise this returns an invalid dimension
+# time_dim <- PEcAn.utils::to_ncdim("time", cf_time)
+time_dim <- ncdf4::ncdim_def(
+  name = "ntime",
+  longname = "Time middle averaging period",
+  units = time_units,
+  vals = cf_time,
+  calendar = "standard",
+  unlim = FALSE
+)
+site_dim <- ncdim_def("site", "", vals = seq_along(site_ids), longname = "Site ID", unlim = FALSE)
+ensemble_dim <- ncdim_def("ensemble", "", vals = ens_ids, longname = "ensemble member", unlim = FALSE)
+
+# Dimension order matches the arrays created above: [time, site, ensemble]
+dims <- list(time_dim, site_dim, ensemble_dim)
+
+# Define forecast variables:
+agb_ncvar <- ncvar_def(
+  name = "AGB",
+  units = "kg C m-2",
+  dim = dims,
+  longname = "Total aboveground biomass"
+)
+soc_ncvar <- ncvar_def(
+  name = "TotSoilCarb",
+  units = "kg C m-2",
+  dim = dims,
+  longname = "Total Soil Carbon"
+)
+time_var <- ncvar_def(
+  name = "time",
+  units = time_units,
+  dim = time_dim,
+  longname = "Time dimension"
+)
+lat_var <- ncvar_def(
+  name = "lat",
+  units = "degrees_north",
+  dim = site_dim,
+  longname = "Latitude"
+)
+
+lon_var <- ncvar_def(
+  name = "lon",
+  units = "degrees_east",
+  dim = site_dim,
+  longname = "Longitude"
+)
+
+nc_vars <- list(
+  time = time_var,
+  lat = lat_var,
+  lon = lon_var,
+  AGB = agb_ncvar,
+  TotSoilCarb = soc_ncvar
+)
+
+nc_file <- file.path(outdir, "ensemble_output.nc")
+
+if (file.exists(nc_file)) {
+  file.remove(nc_file)
+}
+
+nc_out <- ncdf4::nc_create(nc_file, nc_vars)
+# Add attributes to coordinate variables for clarity
+# ncdf4::ncatt_put(nc_out, "time", "bounds", "time_bounds", prec = NA)
+# ncdf4::ncatt_put(nc_out, "time", "axis", "T", prec = NA)
+# ncdf4::ncatt_put(nc_out, "site", "axis", "Y", prec = NA)
+# ncdf4::ncatt_put(nc_out, "ensemble", "axis", "E", prec = NA)
+
+# Write data into the netCDF file.
+ncvar_put(nc_out, time_var, cf_time)
+ncvar_put(nc_out, lat_var, lat)
+ncvar_put(nc_out, lon_var, lon)
+ncvar_put(nc_out, agb_ncvar, ens_arrays[["AGB"]])
+ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]])
+
+## Add global attributes per EFI standards.
+
+# Get run metadata from the workflow STATUS file
+# ??? is there a more reliable way to do this?
+forecast_time <- readr::read_tsv(
+  file.path(modeloutdir, 'output', "STATUS"),
+  col_names = FALSE
+) |>
+  filter(X1 == "FINISHED") |>
+  pull(X3)
+forecast_iteration_id <- as.numeric(forecast_time) # or is run_id available?
+obs_flag <- 0
+
+ncatt_put(nc_out, 0, "model_name", settings$model$type)
+ncatt_put(nc_out, 0, "model_version", settings$model$revision)
+ncatt_put(nc_out, 0, "iteration_id", forecast_iteration_id)
+ncatt_put(nc_out, 0, "forecast_time", forecast_time)
+ncatt_put(nc_out, 0, "obs_flag", obs_flag)
+ncatt_put(nc_out, 0, "creation_date", format(Sys.time(), "%Y-%m-%d"))
+# Close the netCDF file.
+nc_close(nc_out)
+
+PEcAn.logger::logger.info("EFI-compliant netCDF file 'ensemble_output.nc' created.")
+PEcAn.logger::logger.info("All ensemble outputs (RDS, CSV, NetCDF) are in kg C m-2 for AGB and TotSoilCarb.")
diff --git a/downscale/03_downscale_and_aggregate.R b/downscale/03_downscale_and_aggregate.R
new file mode 100644
index 0000000..a7abc9f
--- /dev/null
+++ b/downscale/03_downscale_and_aggregate.R
@@ -0,0 +1,501 @@
+#' ---
+#' title: "Downscale and Aggregate Woody Crop SOC stocks"
+#' author: "David LeBauer"
+#' ---
+#'
+
+#'
+#' # Overview
+#'
+#' This workflow will:
+#'
+#' - Use environmental covariates to predict SIPNET estimated SOC for each woody crop field in the LandIQ dataset
+#'   - Uses Random Forest [may change to CNN later] trained on site-scale model runs
+#'   - Builds a model for each ensemble member
+#' - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance
+#' - Aggregate county-level biomass and SOC inventories
+#'
+## ----setup--------------------------------------------------------------------
+library(tidyverse)
+library(sf)
+library(terra)
+library(furrr)
+library(patchwork) # for combining plots
+
+library(PEcAnAssimSequential)
+datadir <- "/projectnb/dietzelab/ccmmf/data"
+basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859"
+settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml"))
+outdir <- file.path(basedir, settings$modeloutdir)
+options(readr.show_col_types = FALSE)
+
+
+#' ## Get Site Level Outputs
+# written as ensemble_output.csv by 02_extract_sipnet_output.R
+ensemble_file <- file.path(outdir, "ensemble_output.csv")
+ensemble_data <- readr::read_csv(ensemble_file)
+
+#' ### Random Forest using PEcAn downscale workflow
+## -----------------------------------------------------------------------------
+design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0e43ba4f851b7ba0d427d112be362/data/design_points.csv"
+# design_pt_csv <- here::here("data/design_points.csv")
+design_points <- read_csv(design_pt_csv) |>
+  dplyr::distinct()
+
+covariates_csv <- file.path(datadir, "site_covariates.csv")
+covariates <- read_csv(covariates_csv) |>
+  select(
+    site_id, where(is.numeric),
+    -climregion_id
+  )
+
+downscale_carbon_pool <- function(date, carbon_pool) {
+  filtered_ens_data <- subset_ensemble(
+    ensemble_data = ensemble_data,
+    site_coords = design_points,
+    date = date,
+    carbon_pool = carbon_pool
+  )
+
+  # Downscale the data
+  downscale_output <- ensemble_downscale(
+    ensemble_data = filtered_ens_data,
+    site_coords = design_points,
+    covariates = covariates,
+    seed = 123
+  )
+  return(downscale_output)
+}
+
+cpools <- c("TotSoilCarb", "AGB")
+
+downscale_output_list <- purrr::map( # not using furrr b/c it is used inside downscale
+  cpools,
"2018-12-31", carbon_pool = .x) +) |> + purrr::set_names(cpools) + +## Check variable importance + +## Save to make it easier to restart +# saveRDS(downscale_output, file = here::here("cache/downscale_output.rds")) +PEcAn.logger::logger.info("Downscaling complete") + +PEcAn.logger::logger.info("Downscaling model results for each ensemble member:") +metrics <- lapply(downscale_output_list, downscale_metrics) +knitr::kable(metrics) + +median_metrics <- purrr::map(metrics, function(m) { + m |> + select(-ensemble) |> + summarise(#do equivalent of colmeans but for medians) + across( + everything(), + list(median = ~ median(.x)), + .names = "{col}" + ) + ) +}) + +PEcAn.logger::logger.info("Median downscaling model metrics:") +bind_rows(median_metrics, .id = "carbon_pool") |> + knitr::kable() + +#' +#' +#' ## Aggregate to County Level +#' +## ----------------------------------------------------------------------------- + +# ca_fields <- readr::read_csv(here::here("data/ca_field_attributes.csv")) |> +# dplyr::select(id, lat, lon) |> +# rename(site = id) +PEcAn.logger::logger.info("Aggregating to County Level") + +ca_fields_full <- sf::read_sf(file.path(datadir, "ca_fields.gpkg")) + +ca_fields <- ca_fields_full |> + dplyr::select(site_id, county, area_ha) + +# Convert list to table with predictions and site identifier +get_downscale_preds <- function(downscale_output_list) { + purrr::map( + downscale_output_list$predictions, + ~ tibble(site_id = covariates$site_id, prediction = .x) + ) |> + bind_rows(.id = "ensemble") |> + left_join(ca_fields, by = "site_id") +} + +downscale_preds <- purrr::map(downscale_output_list, get_downscale_preds) |> + dplyr::bind_rows(.id = "carbon_pool") |> + # Convert kg/m2 to Mg/ha using PEcAn.utils::ud_convert + mutate(c_density_Mg_ha = PEcAn.utils::ud_convert(prediction, "kg/m2", "Mg/ha")) |> + # Calculate total Mg per field: c_density_Mg_ha * area_ha + mutate(total_c_Mg = c_density_Mg_ha * area_ha) + +ens_county_preds <- downscale_preds |> + # Now aggregate to get county level totals for each pool x ensemble + group_by(carbon_pool, county, ensemble) |> + summarize( + n = n(), + total_c_Mg = sum(total_c_Mg), # total Mg C per county + total_ha = sum(area_ha) + ) |> + ungroup() |> + mutate( + total_c_Tg = PEcAn.utils::ud_convert(total_c_Mg, "Mg", "Tg"), + mean_c_density_Mg_ha = total_c_Mg / total_ha + ) |> + arrange(carbon_pool, county, ensemble) + +# Check number of ensemble members per county/carbon_pool +ens_county_preds |> + group_by(carbon_pool, county) |> + summarize(n_vals = n_distinct(total_c_Mg)) |> + pull(n_vals) |> + unique() + + +county_summaries <- ens_county_preds |> + group_by(carbon_pool, county) |> + summarize( + n = max(n), # Number of fields in county should be same for each ensemble member + co_mean_c_total_Tg = mean(total_c_Tg), + co_sd_c_total_Tg = sd(total_c_Tg), + co_mean_c_density_Mg_ha = mean(mean_c_density_Mg_ha), + co_sd_c_density_Mg_ha = sd(mean_c_density_Mg_ha) + ) + +readr::write_csv( + county_summaries, + file.path(outdir, "county_summaries.csv") +) +PEcAn.logger::logger.info("County summaries written to", file.path(outdir, "county_summaries.csv")) + +## Plot the results! 
+PEcAn.logger::logger.info("Plotting County Level Summaries") +county_boundaries <- st_read(here::here("data/counties.gpkg")) |> + filter(state_name == "California") |> + select(name) + +co_preds_to_plot <- county_summaries |> + right_join(county_boundaries, by = c("county" = "name")) |> + arrange(county, carbon_pool) |> + pivot_longer( + cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), + names_to = "stat", + values_to = "value" + ) |> + mutate(units = case_when( + str_detect(stat, "total_c") ~ "Carbon Stock (Tg)", + str_detect(stat, "c_density") ~ "Carbon Density (Mg/ha)" + ), stat = case_when( + str_detect(stat, "mean") ~ "Mean", + str_detect(stat, "sd") ~ "SD" + )) + +# now plot map of county-level predictions with total carbon +units <- rep(unique(co_preds_to_plot$units), each = length(cpools)) +pool_x_units <- co_preds_to_plot |> + select(carbon_pool, units) |> + distinct() |> + # remove na + filter(!is.na(carbon_pool)) |> # why is one field in SF county NA? + arrange(carbon_pool, units) + +p <- purrr::map2(pool_x_units$carbon_pool, pool_x_units$units, function(pool, unit) { + .p <- ggplot( + co_preds_to_plot |> filter(carbon_pool == pool & units == unit), + aes(geometry = geom, fill = value) + ) + + geom_sf(data = county_boundaries, fill = "lightgrey", color = "black") + + geom_sf() + + scale_fill_viridis_c(option = "plasma") + + theme_minimal() + + facet_grid(carbon_pool ~ stat) + + labs( + title = paste("County-Level Predictions for", pool, unit), + fill = "Value" + ) + + unit <- ifelse(unit == "Carbon Stock (Tg)", "stock", + ifelse(unit == "Carbon Density (Mg/ha)", "density", NA) + ) + + plotfile <- here::here("downscale/figures", paste0("county_", pool, "_carbon_", unit, ".png")) + print(plotfile) + ggsave( + plot = .p, + filename = plotfile, + width = 10, height = 5, + bg = "white" + ) + return(.p) +}) + +# Variable Importance and Partial Dependence Plots + +# First, calculate variable importance summary as before +importance_summary <- map_dfr(cpools, function(cp) { + # Extract the importance for each ensemble model in the carbon pool + importances <- map(1:20, function(i) { + model <- downscale_output_list[[cp]][["model"]][[i]] + randomForest::importance(model)[, "%IncMSE"] + }) + + # Turn the list of importance vectors into a data frame + importance_df <- map_dfr(importances, ~ tibble(importance = .x), .id = "ensemble") |> + group_by(ensemble) |> + mutate(predictor = names(importances[[1]])) |> + ungroup() + + # Now summarize median and IQR for each predictor across ensembles + summary_df <- importance_df |> + group_by(predictor) |> + summarize( + median_importance = median(importance, na.rm = TRUE), + lcl_importance = quantile(importance, 0.25, na.rm = TRUE), + ucl_importance = quantile(importance, 0.75, na.rm = TRUE) + ) |> + mutate(carbon_pool = cp) + + summary_df +}) + +# TODO consider separating out plotting +####---Create checkpoint---#### +# system.time(save(downscale_output_list, importance_summary, covariates, cpools# these are ~500MB +# file = file.path(outdir, "checkpoint.RData"), +# compress = FALSE +# )) +# outdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859/output/out" +# load(file.path(outdir, "checkpoint.RData")) +#### ---End checkpoint---#### + +covariate_names <- names(covariates |> select(where(is.numeric))) +covariates_df <- as.data.frame(covariates) |> + # TODO pass scaled covariates from ensemble_downscale function + # this will ensure that the scaling is the same. 
+  mutate(across(all_of(covariate_names), ~ as.numeric(scale(.x))))
+
+##### Subset data for plotting (speed + visible rug plots) #######
+subset_inputs <- TRUE
+# Subset data for testing / speed purposes
+if (subset_inputs) {
+  # cpools <- c("AGB")
+
+  # Subset covariates for testing (take only a small percentage of rows)
+  n_test_samples <- min(5000, nrow(covariates_df))
+  set.seed(123) # For reproducibility
+  test_indices <- sample(1:nrow(covariates_df), n_test_samples)
+  covariates_full <- covariates_df
+  covariates_df <- covariates_df[test_indices, ]
+
+  # For each model, subset the predictions to match the test indices
+  for (cp in cpools) {
+    if (length(downscale_output_list[[cp]]$predictions) > 0) {
+      downscale_output_list[[cp]]$predictions <-
+        lapply(downscale_output_list[[cp]]$predictions, function(x) x[test_indices])
+    }
+  }
+}
+
+##### End Subsetting Code #######
+
+##### Importance Plots #####
+
+
+for (cp in cpools) {
+  png(
+    filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")),
+    width = 14, height = 6, units = "in", res = 300, bg = "white"
+  )
+
+  # Variable importance plot
+  cp_importance <- importance_summary |>
+    filter(carbon_pool == cp) |>
+    # sort so dotchart rows line up with the IQR segments drawn below
+    arrange(median_importance)
+  with(
+    cp_importance,
+    dotchart(median_importance,
+      labels = predictor,
+      xlab = "Median %IncMSE (segments show IQR)",
+      main = paste("Importance -", cp),
+      pch = 19, col = "steelblue", cex = 1.2
+    )
+  )
+  with(
+    cp_importance,
+    segments(lcl_importance,
+      seq_along(predictor),
+      ucl_importance,
+      seq_along(predictor),
+      col = "gray50"
+    )
+  )
+  dev.off()
+}
+
+##### Importance and Partial Plots #####
+
+## Using pdp + ggplot2
+# # Loop over carbon pools
+# for (cp in cpools) {
+#   # Top 2 predictors for this carbon pool
+#   top_predictors <- importance_summary |>
+#     filter(carbon_pool == cp) |>
+#     arrange(desc(median_importance)) |>
+#     slice_head(n = 2) |>
+#     pull(predictor)
+
+#   # Retrieve model and covariate data
+#   model <- downscale_output_list[[cp]][["model"]][[1]]
+#   cov_df <- covariates_df # Already scaled
+
+#   ## 1. Create Variable Importance Plot with ggplot2
+#   cp_importance <- importance_summary |>
+#     filter(carbon_pool == cp)
+
+#   p_importance <- ggplot(cp_importance, aes(x = median_importance, y = reorder(predictor, median_importance))) +
+#     geom_point(color = "steelblue", size = 3) +
+#     geom_errorbarh(aes(xmin = lcl_importance, xmax = ucl_importance),
+#       height = 0.2,
+#       color = "gray50"
+#     ) +
+#     labs(
+#       title = paste("Importance -", cp),
+#       x = "Median Increase in MSE (SD)",
+#       y = ""
+#     ) +
+#     theme_minimal()
+
+#   ## 2. Create Partial Dependence Plot for the top predictor
+#   pd_data1 <- pdp::partial(
+#     object = model,
+#     pred.var = top_predictors[1],
+#     pred.data = cov_df,
+#     train = cov_df,
+#     plot = FALSE
+#   )
+#   ## Partial dependence for predictor 1
+#   p_partial1 <- ggplot(pd_data1, aes_string(x = top_predictors[1], y = "yhat")) +
+#     geom_line(color = "steelblue", size = 1.2) +
+#     geom_rug(
+#       data = cov_df, aes_string(x = top_predictors[1]),
+#       sides = "b", alpha = 0.5
+#     ) +
+#     labs(
+#       title = paste("Partial Dependence -", top_predictors[1]),
+#       x = top_predictors[1],
+#       y = paste("Predicted", cp)
+#     ) +
+#     theme_minimal()
+
+#   ## Partial dependence for predictor 2
+#   pd_data2 <- pdp::partial(
+#     object = model,
+#     pred.var = top_predictors[2],
+#     pred.data = cov_df,
+#     plot = TRUE,
+#     train = cov_df,
+#     parallel = TRUE
+#   )
+
+#   p_partial2 <- ggplot(pd_data2, aes_string(x = top_predictors[2], y = "yhat")) +
+#     geom_line(color = "steelblue", size = 1.2) +
+#     geom_rug(
+#       data = cov_df, aes_string(x = top_predictors[2]),
+#       sides = "b", alpha = 0.5
+#     ) +
+#     labs(
+#       title = paste("Partial Dependence -", top_predictors[2]),
+#       x = top_predictors[2],
+#       y = paste("Predicted", cp)
+#     ) +
+#     theme_minimal()
+
+#   combined_plot <- p_importance + p_partial1 + p_partial2 + plot_layout(ncol = 3)
+
+#   output_file <- here("downscale/figures", paste0(cp, "_importance_partial_plots.png"))
+#   ggsave(
+#     filename = output_file,
+#     plot = combined_plot,
+#     width = 14, height = 6, dpi = 300, bg = "white"
+#   )
+
+#   # also save pdp-generated plot
+#   pdp_plots <- pd_data1 + pd_data2
+#   ggsave(pdp_plots,
+#     filename = here::here("downscale/figures", paste0(
+#       cp, "_PDP_",
+#       top_predictors[1], "_", top_predictors[2], ".png"
+#     )),
+#     width = 6, height = 4, dpi = 300, bg = "white"
+#   )
+# }
+
+## Using randomForest::partialPlot()
+# Combined importance + partial plots for each carbon pool
+
+
+# for (cp in cpools) {
+#   # Top 2 predictors for this carbon pool
+#   top_predictors <- importance_summary |>
+#     filter(carbon_pool == cp) |>
+#     arrange(desc(median_importance)) |>
+#     slice_head(n = 2) |>
+#     pull(predictor)
+
+#   # Prepare model and subset of covariates for plotting
+#   model <- downscale_output_list[[cp]][["model"]][[1]]
+#   cov_df <- covariates_df
+
+#   # Set up PNG for three panel plot
+#   png(
+#     filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")),
+#     width = 14, height = 6, units = "in", res = 300, bg = "white"
+#   )
+#   par(mfrow = c(1, 3))
+
+#   # Panel 1: Variable importance plot
+#   cp_importance <- importance_summary |> filter(carbon_pool == cp)
+#   par(mar = c(5, 10, 4, 2))
+#   with(
+#     cp_importance,
+#     dotchart(median_importance,
+#       labels = reorder(predictor, median_importance),
+#       xlab = "Median Increase MSE (SD)",
+#       main = paste("Importance -", cp),
+#       pch = 19, col = "steelblue", cex = 1.2
+#     )
+#   )
+#   with(
+#     cp_importance,
+#     segments(lcl_importance,
+#       seq_along(predictor),
+#       ucl_importance,
+#       seq_along(predictor),
+#       col = "gray50"
+#     )
+#   )
+
+#   # Panel 2: Partial plot for top predictor
+#   par(mar = c(5, 5, 4, 2))
+#   randomForest::partialPlot(model,
+#     pred.data = cov_df,
+#     x.var = top_predictors[1],
+#     main = paste("Partial Dependence -", top_predictors[1]),
+#     xlab = top_predictors[1],
+#     ylab = paste("Predicted", cp),
+#     col = "steelblue",
+#     lwd = 2
+#   )
+
+#   # Panel 3: Partial plot for second predictor
+#   randomForest::partialPlot(model,
+#     pred.data = cov_df,
+#     x.var = top_predictors[2],
+#     main = paste("Partial Dependence -", top_predictors[2]),
+#     xlab = top_predictors[2],
+#     ylab = paste("Predicted", cp),
+#     col = "steelblue",
+#     lwd = 2
+#   )
+#   dev.off() # Save combined figure
+# }
diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd
new file mode 100644
index 0000000..245057d
--- /dev/null
+++ b/downscale/04_downscaling_documentation_results.qmd
@@ -0,0 +1,326 @@
+---
+title: "Downscaling Workflow Documentation"
+author: "David LeBauer"
+date: "`r Sys.Date()`"
+format:
+  html:
+    self-contained: true
+    embed-resources: true
+    df-print: paged
+    toc: true
+execute:
+  echo: false
+---
+
+
+
+# Overview
+
+The downscaling workflow is used to predict carbon pools (Soil Organic Carbon and Aboveground Biomass) for cropland fields in California and then aggregate these predictions to the county scale.
+
+It uses an ensemble-based approach to uncertainty propagation and analysis.
+
+This multi-step approach facilitates uncertainty quantification by maintaining ensemble structure to propagate errors through the prediction and aggregation processes.
+
+**Definitions**
+
+- **Design Points**: Fields chosen via stratified random sampling (using k-means clustering on environmental data layers) across all California crop fields.
+- **Crop Fields**: All of the croplands in the LandIQ dataset.
+- **Anchor Sites**: Sites used as ground truth for calibration and validation. These include UC research stations and Ameriflux sites with high-quality data.
+
+
+**Steps**
+
+* **Data Preparation**: Prepares data for the clustering and downscaling process.
+* **Design Point Selection**: Uses k-means clustering to select a representative set of fields spanning environmental space, and adds these to the anchor sites.
+* **SIPNET Model Runs**: A separate workflow prepares inputs and runs SIPNET simulations for the design points.
+* **Extract SIPNET Output**: Extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats.
+* **Downscale and Aggregate SIPNET Output**: Builds a Random Forest model for each ensemble member to predict SOC and AGB, downscales the predictions to the full set of fields, and aggregates predictions to county-level estimates to produce maps and summary statistics.
+
+## Data Preparation
+
+```sh
+Rscript downscale/00_prepare.R
+```
+
+This script prepares data for the clustering and downscaling process.
+
+It reads in the LandIQ crop map, anchor sites, and environmental covariates, and creates a table of environmental covariates for each field in the LandIQ crop map. It also links a table of anchor sites to their corresponding LandIQ fields so that these can be used in downstream analyses.
+
+- Converts LandIQ-derived shapefiles to a geopackage with geospatial information and a CSV with other attributes from the LandIQ dataset.
+- Extracts environmental covariates (clay, organic carbon, topographic wetness, temperature, precipitation, solar radiation, vapor pressure) for each field in the LandIQ dataset.
+- Groups fields into Cal-Adapt climate regions.
+- Assigns anchor sites to fields.
+
+
+**Inputs:**
+
+- **LandIQ Crop Map**
+  - `data_raw/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp`. This is a manually curated and harmonized version of the 2016 LandIQ crop map for California.
+- **Soilgrids**
+  - `clay_0-5cm_mean.tif` and `ocd_0-5cm_mean.tif` are rasters that have been downloaded and prepared for efficient data extraction.
+- **Topographic Wetness Index (TWI)**
+  - `TWI/TWI_resample.tiff`
+- **ERA5 Climatological Means**
+  - Available for years 2012-2024 in the `GridMET/` folder in files named `ERA5_met_<year>.tiff`. Currently only the means of each variable across all years are used. Variables include:
+    - temperature
+    - precipitation
+    - solar radiation
+    - vapor pressure deficit
+- **Anchor Sites**
+  - `data_raw/anchor_sites.csv`: anchor sites (sites with validation data)
+    - external_site_id: Ameriflux site ID or other unique ID, not to be confused with `site_id`.
+    - lat, lon, crops, pft.
+
+Future design point selection factors will include management practices (crop, cropping system (cycle, mixtures), irrigation, tillage, C&N amendments).
+
+**Outputs:**
+
+- `ca_fields.gpkg` contains spatial information from LandIQ including: `site_id`, `lat`, `lon`, `area` (ha), `county`, and `geom`. Lat and lon are centroids of the `geom` field.
+- `ca_field_attributes.csv` contains site_id, lat, lon, year, crop, pft, source, and notes. The crop and pft associated with each field may differ from those in anchor sites, because they come from different sources.
+- `site_covariates.csv` is a table of environmental covariates for each field in the LandIQ crop map.
+  - site_id, temp, precip, srad, vapr, crop, pft, clay, ocd, twi.
+- `anchor_sites_ids.csv`.
+  - site_id, lat, lon, external_site_id, site_name, crops, pft.
+  - (note that lat, lon are the centroid of the field, not the original values in `data_raw/anchor_sites.csv`).
+
+Below is a map of the Anchor Sites and Climate Regions of California.
+
+![Anchor Sites](figures/anchor_sites.png)
+
+## Design Point Selection
+
+```sh
+Rscript downscale/01_cluster_and_select_design_points.R
+```
+
+Use k-means clustering to select 75 fields that represent the environmental space, and add these to the 23 distinct[^1] anchor sites.
+
+[^1]: There are 25 anchor sites, but two have duplicate lat / lon coordinates. This will be addressed in future iterations of the analysis.
+
+These are the sites where the SIPNET crop and biogeochemistry model will be run. Currently we are running SIPNET for 98 total sites, which includes 75 design points and 23 anchor sites.
+
+**Steps**
+
+- Subsample LandIQ fields and include anchor sites for clustering.
+- Select the cluster number based on the Elbow Method.
+  - Plot within-cluster sum of squares (WCSS) against the number of clusters and identify the point where adding more clusters yields diminishing returns.
+  - Also evaluate silhouette scores; future iterations may use multiple methods to select the cluster number.
+- Cluster fields using k-means based on environmental covariates.
+- Select design points from clusters for SIPNET simulation.
+
+**Inputs:**
+
+- `data/site_covariates.csv`
+- `data/anchor_sites_ids.csv`
+
+**Output:**
+
+- `data/design_points.csv`.
+
+
+**Results:**
+
+**A map of design points**, showing their geographic distribution across California and relative to croplands. Grey areas are the LandIQ fields, and the boundaries are CalAdapt Climate Zones.
+
+![Clustered Design Points](figures/design_points.png)
+
+
+The next two plots show the environmental characteristics of the clusters (what makes them different) to help assess the clustering process.
+
+**Second is a pairs plot of the environmental covariates.** This plot shows the relationships between the covariates used for clustering, and colors indicate cluster membership.
+
+![Clustered Pairs Plot](figures/cluster_pairs.png)
+
+**Third is a summary of the normalized mean values of environmental covariates by cluster.** This plot illustrates the environmental characteristics of the clusters.
+
+![Cluster Summary](figures/cluster_plot.png)
+
+## SIPNET Model Runs
+
+These are produced by the Modeling Workflow.
+[Link to Modeling Workflow]
+
+**Steps:**
+
+- Prepare inputs and run SIPNET simulations for the design points.
+
+**Inputs:**
+
+- `design_points.csv`
+- Initial Conditions (described in modeling workflow)
+
+**Outputs:**
+
+- `out/ENS-<ensemble>-<site>/YYYY.nc`
+  - These are NetCDF files that contain the SIPNET outputs for each site in a standardized format.
+  - Currently, `<ensemble>` is a value from 1 to 20 identifying ensemble members, and there are 98 values of `<site>` that identify the design points. For final runs, these numbers may be closer to 100 ensemble members and 10k design points.
+
+## Extract SIPNET Output
+
+```sh
+Rscript downscale/02_extract_sipnet_output.R
+```
+
+This step extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats.
+
+**Steps:**
+
+- Extract output variables (AGB, TotSoilCarb) from SIPNET simulations.
+- Aggregate site-level ensemble outputs into long and 4D array formats.
+- Save CSV and NetCDF files for downstream use in formats that follow EFI standards for forecasts (Dietze et al. 2023).
+
+**Inputs:**
+
+- `out/ENS-<ensemble>-<site>/YYYY.nc`
+
+**Outputs:**
+
+- `out/efi_ens_long.csv`: long format.
+- `out/efi_forecast.nc`: arrays in NetCDF format.
+
+
+## Downscale and Aggregate SIPNET Output
+
+```sh
+Rscript downscale/03_downscale_and_aggregate.R
+```
+
+Builds a Random Forest model for each ensemble member of each output variable to predict SIPNET-estimated SOC and AGB.
+
+**Steps:**
+
+- Train models on SIPNET ensemble runs at design points to predict for all California fields.
+- Use environmental covariates extracted in the data preparation step to downscale the predictions to the full set of fields, including all woody crop fields.
+- Aggregate predictions to county-level estimates to produce maps and summary statistics.
+- Output maps and statistics of carbon density and totals.
+
+**Inputs:**
+
+- `out/efi_ens_long.csv`: long format SIPNET outputs for ensemble runs at design points.
+- `data/site_covariates.csv`: environmental covariates for each field in the LandIQ crop map.
+
+**Outputs:**
+
+- County-level statistics for each variable as tables and maps.
+- `out/county_total_AGB.png`: county-level AGB predictions.
+- `out/county_total_TotSoilCarb.png`: county-level SOC predictions.
+- `out/county_summaries.csv`: summary statistics for each county.
+
+## Results
+
+### County-Level Carbon Stock and Density Maps
+
+The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level.
+
+#### County Carbon Stock for TotSoilCarb
+
+![](figures/county_TotSoilCarb_carbon_stock.png)
+
+#### County Carbon Density for TotSoilCarb
+
+![](figures/county_TotSoilCarb_carbon_density.png)
+
+#### County Carbon Stock for AGB
+
+![](figures/county_AGB_carbon_stock.png)
+
+#### County Carbon Density for AGB
+
+![](figures/county_AGB_carbon_density.png)
+
+### Variable Importance and Partial Dependence
+
+The following plots show the variable importance from the random forest models used for downscaling, along with partial dependence plots for the top two predictors.
+
+Variable importance quantifies how useful each covariate is in predicting the carbon stock.
Partial dependence plots show the marginal effect of individual predictors on the model response after averaging over the other predictors.
+
+![](figures/importance.png)
+
+
+### Searchable Table
+
+The table below provides a searchable summary of the county-level carbon stocks and densities.
+
+```{r}
+
+outdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859/output/out"
+# Load county summaries data
+county_summaries <- readr::read_csv(file.path(outdir, "county_summaries.csv"),
+                                    show_col_types = FALSE)
+
+# Combine mean and SD into a single column for carbon density
+county_summaries_table <- county_summaries |>
+  dplyr::mutate(
+    `Mean Total C (Tg/county)` = paste0(
+      signif(co_mean_c_total_Tg, 2),
+      " (", signif(co_sd_c_total_Tg, 2), ")"
+    ),
+    `Mean C Density (Mg/ha)` = paste0(
+      signif(co_mean_c_density_Mg_ha, 2),
+      " (", signif(co_sd_c_density_Mg_ha, 2), ")"
+    )
+  ) |>
+  dplyr::rename(
+    `Carbon Pool` = carbon_pool,
+    `County` = county,
+    `# Fields` = n
+  ) |>
+  dplyr::select(`Carbon Pool`, `County`, `# Fields`, `Mean Total C (Tg/county)`, `Mean C Density (Mg/ha)`)
+
+# Create Table
+# TODO
+# - Fix point w/ missing county
+
+htmlwidgets::setWidgetIdSeed(123) # required to embed table self-contained in html
+options(htmlwidgets.TEMP_DIR = "htmlwidgets")
+
+DT::datatable(
+  county_summaries_table,
+  options = list(
+    pageLength = 10,
+    searchHighlight = TRUE
+  ),
+  rownames = FALSE,
+  escape = FALSE
+)
+```
+
+# References
+
+**EFI Standards**
+
+Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren, Alexey N. Shiklomanov, and Jaime Ashander. 2023. "A Community Convention for Ecological Forecasting: Output Files and Metadata Version 1.0." Ecosphere 14 (11): e4686. https://doi.org/10.1002/ecs2.4686.
+
+**LandIQ Crop Map**
+
+Land IQ, LLC. California Crop Mapping (2014). California Department of Water Resources, 2017. https://data.cnra.ca.gov/dataset/statewide-crop-mapping.
+
+**SoilGrids250m**
+
+Hengl, T. et al. 2017. "SoilGrids250m: Global Gridded Soil Information Based on Machine Learning." PLoS ONE 12 (2): e0169748. https://doi.org/10.1371/journal.pone.0169748.
+
+**ERA5 Climate Data**
+
+Hersbach, H. et al. 2020. "The ERA5 Global Reanalysis." Quarterly Journal of the Royal Meteorological Society 146: 1999–2049. https://doi.org/10.1002/qj.3803.
+
+**SIPNET**
+
+Braswell, Bobby H., William J. Sacks, Ernst Linder, and David S. Schimel. 2005. "Estimating Diurnal to Annual Ecosystem Parameters by Synthesis of a Carbon Flux Model with Eddy Covariance Net Ecosystem Exchange Observations." Global Change Biology 11 (2): 335–55. https://doi.org/10.1111/j.1365-2486.2005.00897.x.
+
+Sacks, William J., David S. Schimel, Russell K. Monson, and Bobby H. Braswell. 2006. "Model‐data Synthesis of Diurnal and Seasonal CO2 Fluxes at Niwot Ridge, Colorado." Global Change Biology 12 (2): 240–59. https://doi.org/10.1111/j.1365-2486.2005.01059.x.
+
+**Random Forest**
+
+Liaw, Andy, and Matthew Wiener. 2002. "Classification and Regression by randomForest." R News 2 (3): 18–22. https://CRAN.R-project.org/doc/Rnews/.
\ No newline at end of file