From 4bcac1c1e85200f64fa41b9dbf0c55e4c819e482 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:01:47 +0000 Subject: [PATCH 01/15] Add mlr3filters TODO to trans_preds_t Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/2eb1143b-0b8d-42f2-ad27-052b4100baea Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_preds_t.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 821e527..7c6b4fe 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -169,6 +169,13 @@ prune_trans_worker <- function(item, db, filter_fun, ordered_pred_data = FALSE, ) } +# TODO: mlr3filters (https://mlr3filters.mlr-org.com/) provides a range of filter methods +# (mutual information, permutation importance, correlation-based, etc.) that could replace or +# supplement the current covariance_filter approach here. This refactoring should follow the +# same patterns as trans_models_t: accept an mlr3 Filter object (or id string + params) in +# place of filter_fun, store the filter id and parameters as DuckDB-native MAP columns, and +# serialize the filter object as a BLOB for full reproducibility. + #' @describeIn trans_preds_t Get a pruned set of transition-predictor relationships #' based on a filtering function #' @param filter_fun A function that takes a transition-predictor data (cf. [trans_pred_data_v]) and From b649c94b40c8ab8ca0cd5908980fa5b9c17d36cb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:36:27 +0000 Subject: [PATCH 02/15] Refactor trans_models_t to use mlr3 interface Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/16192889-e909-4511-bc6c-4fc80926b365 Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- DESCRIPTION | 2 + R/evoland_db.R | 29 +- R/trans_models_t.R | 386 +++++++++++++--------- R/trans_pot_t.R | 37 +-- inst/tinytest/test_integ_allocation.R | 18 +- inst/tinytest/test_integ_trans_models_t.R | 281 ++++++---------- 6 files changed, 388 insertions(+), 365 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 322f390..fac40ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,8 @@ Imports: Suggests: base64enc, butcher, + mlr3, + mlr3viz, pROC, processx, quarto, diff --git a/R/evoland_db.R b/R/evoland_db.R index a29edb7..ba0064d 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -173,12 +173,15 @@ evoland_db <- R6::R6Class( #' @description #' Fit full models on complete data using the best partial model configuration for #' each transition, see [fit_full_models()] - #' @param partial_models A trans_models_t table with partial models (see [fit_partial_models()]) - #' @param gof_criterion Which goodness-of-fit metric to use for model selection (e.g., "auc") + #' @param learner An mlr3 `Learner` or `AutoTuner` object; used as last-resort + #' fallback for reconstruction. + #' @param measures A list of mlr3 `Measure` objects; kept for API consistency. + #' @param gof_criterion Which cross-validation measure to use for model selection (e.g., `"classif.auc"`) #' @param gof_maximize Maximize (TRUE) or minimize (FALSE) the gof_criterion? #' @param cluster Optional cluster object for parallel processing fit_full_models = function( - partial_models, + learner, + measures, gof_criterion, gof_maximize, cluster = NULL @@ -189,23 +192,29 @@ evoland_db <- R6::R6Class( #' @description Fit partial models for each viable transition using stratified #' sampling. Models are trained on a subsample and evaluated on held-out data, see #' [fit_partial_models()] for details. - #' @param fit_fun Function for generating a model object. - #' @param gof_fun Function to evaluate goodness of fit. + #' @param learner An mlr3 `Learner` or `AutoTuner` R6 object. + #' @param measures A list of mlr3 `Measure` objects for scoring the held-out split. #' @param sample_frac Fraction in \(0, 1\) for stratified sampling. #' @param seed Random seed for reproducible sampling #' @param cluster Optional cluster object for parallel processing - #' @param ... additional arguments passed to fit_fun fit_partial_models = function( - fit_fun, + learner, + measures, sample_frac = 0.7, - gof_fun, seed = NULL, - cluster = NULL, - ... + cluster = NULL ) { create_method_binding(fit_partial_models) }, + #' @description + #' Get cross-validation plots for stored predictions, see [get_crossval_plots()] + #' @param id_run Optional integer; filter by run ID. + #' @param id_trans Optional integer; filter by transition ID. + get_crossval_plots = function(id_run = NULL, id_trans = NULL) { + create_method_binding(get_crossval_plots) + }, + #' @description #' Set an initial full set of transition / predictor relations, see [set_full_trans_preds()] #' @param overwrite Logical, whether to overwrite existing `trans_preds_t` table. Default FALSE. diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 47bcd73..cdf6adf 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -2,7 +2,7 @@ #' #' Creates a trans_models_t table for storing transition model metadata and #' serialized model objects. This function creates an empty table with proper -#' structure for storing fitted models. +#' structure for storing fitted models via the mlr3 interface. #' #' @name trans_models_t #' @@ -11,24 +11,29 @@ #' @return A data.table of class "trans_models_t" with columns: #' - `id_run`: Foreign key to runs_t #' - `id_trans`: Foreign key to trans_meta_t -#' - `model_family`: Model family (e.g., "rf", "glm", "bayesian") -#' - `model_params`: Map of model (hyper) parameters -#' - `goodness_of_fit`: Map of various measures of fit (e.g., ROC AUC, RMSE) -#' - `fit_call`: Character string of the original fit function call for reproducibility -#' - `model_obj_part`: BLOB of serialized model object for validation -#' - `model_obj_full`: BLOB of serialized model object for extrapolation +#' - `learner_id`: mlr3 learner key, e.g. `"classif.ranger"` +#' - `learner_params`: MAP of atomic scalar learner hyperparameters for +#' querying; complete hyperparameters are captured by `learner_spec` +#' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for +#' AutoTuners, this is the optimal inner learner after tuning +#' - `crossval_measures`: MAP of cross-validation performance scores +#' (from `prediction$score(measures)`) +#' - `crossval_predictions`: BLOB of serialized mlr3 `PredictionClassif` +#' on the held-out test split +#' - `learner_full`: BLOB of serialized trained mlr3 `Learner` fitted on +#' the full dataset, used for extrapolation #' @export as_trans_models_t <- function(x) { if (missing(x)) { x <- data.table::data.table( id_run = integer(0), id_trans = integer(0), - model_family = character(0), - model_params = list(), - goodness_of_fit = list(), - fit_call = character(0), - model_obj_part = list(), - model_obj_full = list() + learner_id = character(0), + learner_params = list(), + learner_spec = list(), + crossval_measures = list(), + crossval_predictions = list(), + learner_full = list() ) } @@ -39,8 +44,8 @@ as_trans_models_t <- function(x) { as_parquet_db_t( x, "trans_models_t", - key_cols = c("id_run", "id_trans", "fit_call"), - map_cols = c("model_params", "goodness_of_fit"), + key_cols = c("id_run", "id_trans", "learner_id"), + map_cols = c("learner_params", "crossval_measures"), partition_cols = "id_run" ) } @@ -50,27 +55,19 @@ as_trans_models_t <- function(x) { fit_partial_model_worker <- function( item, db, - fit_fun, - gof_fun, + learner, + measures, seed = NULL, - sample_frac = 0.7, - ... + sample_frac = 0.7 ) { + if (!requireNamespace("mlr3", quietly = TRUE)) { + stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") + } + id_run_orig <- db$id_run on.exit(db$id_run <- id_run_orig, add = TRUE) db$id_run <- item[["id_run"]] - # We modify the fit_fun by attaching the fit_fun_args to its formals. This allows - # us to deparse it so as to store a string representation. When calling the - # function object - possibly reconstructed using str2lang - only the data argument - # should change (subsampled/partial or full) - formals(fit_fun) <- c(formals(fit_fun), list(...)) - - # Deparse to character string for storage - fit_call_str <- - deparse(fit_fun, width.cutoff = 500L) |> - paste(collapse = "\n ") - tryCatch( { # Fetch ALL data into memory @@ -95,12 +92,10 @@ fit_partial_model_worker <- function( )) } - # Stratified sampling - # Split by did_transition (TRUE/FALSE) + # Stratified sampling by did_transition idx_true <- which(trans_pred_data_full[["did_transition"]]) idx_false <- which(!trans_pred_data_full[["did_transition"]]) - # Sample from each group n_train_true <- ceiling(length(idx_true) * sample_frac) n_train_false <- ceiling(length(idx_false) * sample_frac) @@ -113,58 +108,70 @@ fit_partial_model_worker <- function( sample(idx_false, n_train_false) ) - train_data <- trans_pred_data_full[train_idx] - test_data <- trans_pred_data_full[!train_idx] - - # actually evaluate the fit_fun - model <- fit_fun(data = train_data) + # Subset to task columns (did_transition + predictors) and coerce target + task_cols <- c("did_transition", pred_cols) + train_data <- trans_pred_data_full[train_idx, .SD, .SDcols = task_cols] + test_data <- trans_pred_data_full[-train_idx, .SD, .SDcols = task_cols] - # Evaluate on test data - goodness_of_fit <- gof_fun(model = model, test_data = test_data) + train_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] + test_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] - # Extract model family - model_family <- if (!is.null(attr(model, "family"))) { - as.character(attr(model, "family")) - } else if (inherits(model, "glm")) { - paste0("glm_", model$family$family) - } else { - class(model)[1] - } - - # Extract model params for subsetting - model_params <- list( - n_predictors = length(pred_cols), - n_train = nrow(train_data), - sample_frac = sample_frac, - ... + # Build mlr3 task and train a fresh clone of the learner + train_task <- mlr3::as_task_classif( + train_data, + target = "did_transition", + positive = "TRUE" + ) + trained_learner <- learner$clone(deep = TRUE) + trained_learner$train(train_task) + + # Predict on test data; test_data includes did_transition as truth + prediction <- trained_learner$predict_newdata(test_data) + + # Score with supplied measures + scores <- as.list(prediction$score(measures)) + + # For AutoTuner: extract optimal inner learner; otherwise use trained learner + extract_from <- + if (inherits(trained_learner, "AutoTuner") && !is.null(trained_learner$learner$model)) { + trained_learner$learner + } else { + trained_learner + } + + l_id <- extract_from$id + l_params <- Filter( + function(v) is.atomic(v) && length(v) == 1L, + extract_from$param_set$values ) + l_params <- if (length(l_params) == 0L) NULL else l_params + l_spec <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) - # Create result row data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - model_family = model_family, - model_params = list(model_params), - goodness_of_fit = list(goodness_of_fit), - fit_call = fit_call_str, - model_obj_part = list(qs2::qs_serialize(model)), - model_obj_full = list(NULL) + learner_id = l_id, + learner_params = list(l_params), + learner_spec = list(l_spec), + crossval_measures = list(scores), + crossval_predictions = list(qs2::qs_serialize(prediction)), + learner_full = list(NULL) ) }, error = function(e) { warning(glue::glue( "Error fitting model for transition {item[['id_trans']]}: {e$message}" )) - return(data.table::data.table( + data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - model_family = "error", - model_params = list(NULL), - goodness_of_fit = list(list(failed = TRUE, message = e$message)), - fit_call = fit_call_str, - model_obj_part = list(NULL), - model_obj_full = list(NULL) - )) + learner_id = "error", + learner_params = list(NULL), + learner_spec = list(NULL), + crossval_measures = list(NULL), + crossval_predictions = list(NULL), + learner_full = list(NULL) + ) } ) } @@ -172,6 +179,10 @@ fit_partial_model_worker <- function( # Worker function for full model fitting # Not exported; used internally by fit_full_models fit_full_model_worker <- function(item, db, ...) { + if (!requireNamespace("mlr3", quietly = TRUE)) { + stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") + } + tryCatch( { # Fetch full data @@ -186,52 +197,65 @@ fit_full_model_worker <- function(item, db, ...) { )) } - # Retrieve the fit call from best partial model (as character string) - fit_call_str <- item[["fit_call"]] + pred_cols <- grep("^id_pred_", names(trans_pred_data_full), value = TRUE) + task_cols <- c("did_transition", pred_cols) + task_data <- trans_pred_data_full[, .SD, .SDcols = task_cols] + task_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] + + full_task <- mlr3::as_task_classif( + task_data, + target = "did_transition", + positive = "TRUE" + ) - # Check that fit_call exists - if (is.na(fit_call_str) || fit_call_str == "") { - stop(glue::glue( - "fit_call not found for transition {item[['id_trans']]}" - )) - } + # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) + learner_spec_raw <- item[["learner_spec"]][[1L]] + l_id <- item[["learner_id"]] + l_params <- item[["learner_params"]][[1L]] + + trained_learner <- tryCatch( + qs2::qs_deserialize(learner_spec_raw), + error = function(e) { + warning(glue::glue( + "learner_spec deserialization failed for {l_id}: {e$message}; ", + "falling back to do.call reconstruction" + )) + do.call(mlr3::lrn, c(list(l_id), as.list(l_params))) + } + ) - # Parse the character string to call object, reconstruct function, call - fit_fun <- eval(str2lang(fit_call_str)) - model_full <- fit_fun(data = trans_pred_data_full) + trained_learner$train(full_task) - # Create result row - copy from partial model but update model_obj_full list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - fit_call = fit_call_str, - model_obj_full = list(qs2::qs_serialize(model_full)) + learner_id = l_id, + learner_full = list(qs2::qs_serialize(trained_learner)) ) }, error = function(e) { warning(glue::glue( "Error fitting full model for transition {item[['id_trans']]}: {e$message}" )) - return(list( + list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - fit_call = item[["fit_call"]], - model_obj_full = list(NULL) - )) + learner_id = item[["learner_id"]], + learner_full = list(NULL) + ) } ) } -#' @describeIn trans_models_t Fit partial models for each viable transition and store -#' results in a trans_models_t table. -#' @param self, [evoland_db] instance to query for transitions and predictor data -#' @param fit_fun Function that takes a data.frame with predictors and did_transition columns -#' and returns a fitted model object. The data argument is passed as the first argument -#' to the function, and additional arguments can be passed via ... -#' @param gof_fun Function that takes a fitted model object and a test data.frame and -#' returns a list of goodness-of-fit metrics. The model argument is passed as the first -#' argument and the test_data argument is passed as the second argument. +#' @describeIn trans_models_t Fit partial (cross-validation) models for each viable +#' transition and store results in a trans_models_t table. +#' @param self [evoland_db] instance to query for transitions and predictor data +#' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained +#' for each transition; the original object is not modified. For `AutoTuner`, +#' the optimal inner learner is extracted after tuning. +#' @param measures A list of mlr3 `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`) +#' used to score the held-out predictions; results are written to `crossval_measures`. #' @param sample_frac Numeric between 0 and 1 indicating #' the fraction of data to use for training the partial models. The rest is used for #' testing and calculating goodness-of-fit metrics. Default is 0.7 (70% training, 30% @@ -241,12 +265,11 @@ fit_full_model_worker <- function(item, db, ...) { #' [mirai::make_cluster()]. fit_partial_models <- function( self, - fit_fun, - gof_fun, + learner, + measures, sample_frac = 0.7, seed = NULL, - cluster = NULL, - ... + cluster = NULL ) { trans_preds_nested <- data.table::as.data.table(self$trans_preds_t)[, @@ -265,8 +288,8 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, - "fit_fun must be a function" = is.function(fit_fun), - "gof_fun must be a function" = is.function(gof_fun), + "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), + "measures must be a non-empty list" = is.list(measures) && length(measures) > 0L, "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -282,30 +305,33 @@ fit_partial_models <- function( worker_fun = fit_partial_model_worker, parent_db = self, cluster = cluster, - fit_fun = fit_fun, - gof_fun = gof_fun, + learner = learner, + measures = measures, seed = seed, - sample_frac = sample_frac, - ... + sample_frac = sample_frac ) |> data.table::rbindlist() |> as_trans_models_t() } #' @describeIn trans_models_t Fit full models for each transition based on the best -#' partial model according to a specified goodness-of-fit criterion. -#' @param self, [evoland_db] instance to query for transitions and predictor data -#' @param partial_models A trans_models_t table containing the fitted partial models and -#' their goodness-of-fit metrics. -#' @param gof_criterion Character string specifying which goodness-of-fit metric to use for -#' selecting the best partial model for each transition (e.g., "roc_auc", "rmse"). -#' @param gof_maximize Logical indicating whether to select the model with the maximum -#' (TRUE) or minimum (FALSE) value of the specified goodness-of-fit criterion. Default -#' is TRUE. +#' partial model according to a specified cross-validation criterion. +#' @param self [evoland_db] instance to query for transitions and predictor data +#' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and +#' used as a last-resort fallback if both `learner_spec` deserialization and +#' `do.call(mlr3::lrn, ...)` reconstruction fail. +#' @param measures A list of mlr3 `Measure` objects; kept for API consistency. +#' @param gof_criterion Character string specifying which cross-validation measure to use +#' for selecting the best partial model per transition (must match a key in +#' `crossval_measures`, e.g. `"classif.auc"`). +#' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum +#' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. fit_full_models <- function( self, + learner, + measures, gof_criterion, gof_maximize, cluster = NULL @@ -313,11 +339,12 @@ fit_full_models <- function( stopifnot( "gof_criterion must be a character string" = is.character(gof_criterion) && length(gof_criterion) == 1L, - "gof_maximize must be a set to TRUE or FALSE" = (gof_maximize || !gof_maximize), + "gof_maximize must be set to TRUE or FALSE" = isTRUE(gof_maximize) || isFALSE(gof_maximize), "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) ) - best_models <- self$get_query(glue::glue( + # Get the best partial model per transition (scalar columns only; MAP/BLOB via fetch below) + best_model_ids <- self$get_query(glue::glue( r"[ with preds_nested as ( select @@ -332,7 +359,7 @@ fit_full_models <- function( select tm.id_run, tm.id_trans, - tm.fit_call, + tm.learner_id, pn.id_pred, from {self$get_read_expr("trans_models_t")} tm, @@ -342,11 +369,27 @@ fit_full_models <- function( and pn.id_trans = tm.id_trans qualify row_number() over ( partition by tm.id_run, tm.id_trans - order by tm.goodness_of_fit['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by tm.crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} ) = 1; ]" )) + # Fetch learner_spec (BLOB) and learner_params (MAP) for the best rows via fetch() + # so that MAP columns are properly deserialized to named lists. + learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") + best_specs <- self$fetch( + "trans_models_t", + cols = c("id_run", "id_trans", "learner_id", "learner_spec", "learner_params"), + where = glue::glue( + "id_run in ({toString(best_model_ids$id_run)}) and ", + "id_trans in ({toString(best_model_ids$id_trans)}) and ", + "learner_id in ({learner_id_csv})" + ) + ) + + # Join to add id_pred and build complete item list for workers + best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] + message(glue::glue( "Fitting full models for {nrow(best_models)} transitions..." )) @@ -362,31 +405,33 @@ fit_full_models <- function( ) |> data.table::rbindlist() + # Fetch remaining columns from the best partial models and join partial_models <- self$fetch( "trans_models_t", cols = c( "id_run", "id_trans", - "fit_call", - "model_family", - "model_params", - "goodness_of_fit", - "model_obj_part" + "learner_id", + "learner_params", + "learner_spec", + "crossval_measures", + "crossval_predictions" ), where = glue::glue( "id_run in ({toString(full_models$id_run)}) and ", - "id_trans in ({toString(full_models$id_trans)})" + "id_trans in ({toString(full_models$id_trans)}) and ", + "learner_id in ({paste0(\"'\", full_models$learner_id, \"'\", collapse = \", \")})" ) ) full_models[ partial_models, - on = c("id_run", "id_trans", "fit_call"), + on = c("id_run", "id_trans", "learner_id"), `:=`( - model_family = i.model_family, - model_params = i.model_params, - goodness_of_fit = i.goodness_of_fit, - model_obj_part = i.model_obj_part + learner_params = i.learner_params, + learner_spec = i.learner_spec, + crossval_measures = i.crossval_measures, + crossval_predictions = i.crossval_predictions ) ] |> as_trans_models_t() @@ -402,12 +447,12 @@ validate.trans_models_t <- function(x, ...) { c( "id_run", "id_trans", - "model_family", - "model_params", - "goodness_of_fit", - "fit_call", - "model_obj_part", - "model_obj_full" + "learner_id", + "learner_params", + "learner_spec", + "crossval_measures", + "crossval_predictions", + "learner_full" ) ) @@ -419,14 +464,14 @@ validate.trans_models_t <- function(x, ...) { stopifnot( is.integer(x[["id_run"]]), is.integer(x[["id_trans"]]), - is.character(x[["model_family"]]), - is.list(x[["model_params"]]), - is.list(x[["goodness_of_fit"]]), - is.character(x[["fit_call"]]), - is.list(x[["model_obj_part"]]), - is.list(x[["model_obj_full"]]), + is.character(x[["learner_id"]]), + is.list(x[["learner_params"]]), + is.list(x[["learner_spec"]]), + is.list(x[["crossval_measures"]]), + is.list(x[["crossval_predictions"]]), + is.list(x[["learner_full"]]), all(x[["id_trans"]] > 0), - !any(x[["model_family"]] == "") + !any(x[["learner_id"]] == "") ) return(x) @@ -437,16 +482,16 @@ validate.trans_models_t <- function(x, ...) { print.trans_models_t <- function(x) { if (nrow(x) > 0) { n_trans <- data.table::uniqueN(x[["id_trans"]]) - model_families <- unique(x[["model_family"]]) - n_with_part_models <- sum(!vapply(x[["model_obj_part"]], is.null, logical(1))) - n_with_full_models <- sum(!vapply(x[["model_obj_full"]], is.null, logical(1))) + learner_ids <- unique(x[["learner_id"]]) + n_with_crossval <- sum(!vapply(x[["crossval_predictions"]], is.null, logical(1))) + n_with_full <- sum(!vapply(x[["learner_full"]], is.null, logical(1))) cat(glue::glue( "Transition Models Table\n", "Total models: {nrow(x)}\n", "Transitions: {n_trans}\n", - "Model families: {paste(model_families, collapse = ', ')}\n", - "With partial models: {n_with_part_models}, With full models: {n_with_full_models}\n\n" + "Learners: {paste(learner_ids, collapse = ', ')}\n", + "With cross-val predictions: {n_with_crossval}, With full models: {n_with_full}\n\n" )) } else { cat("Transition Models Table (empty)\n") @@ -454,3 +499,46 @@ print.trans_models_t <- function(x) { print_rowwise_yaml(x) invisible(x) } + +#' @describeIn trans_models_t Deserialize cross-validation predictions and return +#' plots via `mlr3viz::autoplot()`. Requires the `mlr3viz` package. +#' @param self [evoland_db] instance +#' @param id_run Optional integer; filter by run ID. +#' @param id_trans Optional integer; filter by transition ID. +get_crossval_plots <- function(self, id_run = NULL, id_trans = NULL) { + if (!requireNamespace("mlr3viz", quietly = TRUE)) { + stop("Package 'mlr3viz' is required. Install with: install.packages('mlr3viz')") + } + + where_clauses <- c() + if (!is.null(id_run)) { + where_clauses <- c(where_clauses, glue::glue("id_run = {id_run}")) + } + if (!is.null(id_trans)) { + where_clauses <- c(where_clauses, glue::glue("id_trans = {id_trans}")) + } + where <- if (length(where_clauses) > 0L) paste(where_clauses, collapse = " and ") else NULL + + models <- self$fetch( + "trans_models_t", + cols = c("id_run", "id_trans", "learner_id", "crossval_predictions"), + where = where + ) + + plots <- lapply(seq_len(nrow(models)), function(i) { + pred_blob <- models$crossval_predictions[[i]] + if (is.null(pred_blob)) { + return(NULL) + } + prediction <- qs2::qs_deserialize(pred_blob) + mlr3viz::autoplot(prediction) + }) + + names(plots) <- paste0( + "id_run=", models$id_run, + "_id_trans=", models$id_trans, + "_", models$learner_id + ) + + plots +} diff --git a/R/trans_pot_t.R b/R/trans_pot_t.R index 75e351e..d71a36b 100644 --- a/R/trans_pot_t.R +++ b/R/trans_pot_t.R @@ -113,10 +113,10 @@ predict_trans_pot <- function( # Get model for this transition model_row <- self$get_query(glue::glue( r"[ - select model_obj_full + select learner_full from {self$get_read_expr("trans_models_t")} where id_trans = {id_trans} - order by goodness_of_fit['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} limit 1 ]" )) @@ -125,8 +125,8 @@ predict_trans_pot <- function( stop(glue::glue("Expecting exactly one model for id_trans={id_trans}")) } - # Deserialize full model - model_obj <- qs2::qs_deserialize(model_row$model_obj_full[[1]]) + # Deserialize full learner + learner_obj <- qs2::qs_deserialize(model_row$learner_full[[1]]) # Get predictor data for id_period_post at coords with id_lulc_ant at id_period_post - 1 pred_data_post <- self$pred_data_wide_v( @@ -141,33 +141,10 @@ predict_trans_pot <- function( next } - # Predict probabilities - # Drop id_coord for prediction + # Predict probabilities using mlr3 predict_newdata; drop id_coord (non-feature) pred_cols <- grep("^id_pred_", names(pred_data_post), value = TRUE) - - # Predict - assuming model has predict() method that returns probabilities - # FIXME apparently not all models respect the standard function signature, pull in - # mlr3 - if (inherits(model_obj, "ranger")) { - probs <- - predict( - model_obj, - data = pred_data_post[, -"id_coord"], - type = "response" - )[[ - "predictions" # assume model has been run with probability = TRUE - ]][, - "TRUE" # matrix column for transition _does_ occur - ] - } else { - probs <- - predict( - model_obj, - newdata = pred_data_post[, -"id_coord"], - type = "response" - ) |> - setNames(NULL) - } + pred_feature_data <- pred_data_post[, .SD, .SDcols = pred_cols] + probs <- learner_obj$predict_newdata(pred_feature_data)$prob[, "TRUE"] # Ensure probabilities are in [0, 1] probs <- pmax(0, pmin(1, probs)) diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 06884d7..0dd1429 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -5,6 +5,10 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } +if (!requireNamespace("mlr3", quietly = TRUE)) { + exit_file("mlr3 not available; skipping allocation integration tests") +} + source(file.path(system.file("tinytest", package = "evoland"), "helper_testdb.R")) db <- make_test_db() db$trans_rates_t <- db$get_obs_trans_rates() @@ -13,18 +17,24 @@ db$trans_rates_t <- extrapolate_trans_rates( db$periods_t, coord_count = nrow(db$coords_t) ) -# test the package's standard glm quasibinomial fit and append to disk + +test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") +test_measures <- list(mlr3::msr("classif.auc")) + +# test the package's featureless learner fit and append to disk expect_message( db$trans_models_t <- db$fit_partial_models( - fit_fun = fit_glm, - gof_fun = gof_glm, + learner = test_learner, + measures = test_measures, seed = 1244244, ), "Fitting partial models for 2 transitions..." ) expect_message( db$trans_models_t <- db$fit_full_models( - gof_criterion = "auc", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "Fitting full models for" diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index b0f972e..0308d57 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -7,19 +7,21 @@ expect_stdout(print(as_trans_models_t()), "Transition Models Table") trans_models_t <- as_trans_models_t(data.table::data.table( id_run = 1000L, id_trans = 1L, - model_family = "rf", - model_params = list( - list(depth = 100, ntrees = 500) + learner_id = "classif.featureless", + learner_params = list( + list(method = "mode") ), - goodness_of_fit = list( - list(auc = 0.8, rmse = 0.15) + learner_spec = list( + charToRaw("learner spec blob") ), - fit_call = "fit_fun(data = data)", - model_obj_part = list( - charToRaw("partial model data") + crossval_measures = list( + list(classif.auc = 0.8) ), - model_obj_full = list( - charToRaw("full model data") + crossval_predictions = list( + charToRaw("predictions blob") + ), + learner_full = list( + charToRaw("full learner blob") ) )) expect_equal(nrow(trans_models_t), 1L) @@ -30,6 +32,10 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } +if (!requireNamespace("mlr3", quietly = TRUE)) { + exit_file("mlr3 not available; skipping integration tests") +} + # Load fixtures via helper source(file.path( system.file("tinytest", package = "evoland"), @@ -37,194 +43,118 @@ source(file.path( )) db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -# Test fit_partial_models and fit_full_models workflow -# Define a simple mock fit function for testing -fit_mock_glm <- function(data, ...) { - pred_cols <- grep("^id_pred_", names(data), value = TRUE) - - if (length(pred_cols) == 0) { - stop("No predictor columns found") - } - - # Create a simple formula - formula_str <- paste("did_transition", "~", paste(pred_cols, collapse = " + ")) - formula <- as.formula(formula_str) - - # Fit a simple GLM - model <- glm(formula, data = data, family = binomial()) - - return(model) -} - -# Define a goodness of fit function -gof_mock <- function(model, test_data) { - predictions <- predict(model, newdata = test_data, type = "response") - actual <- test_data[["did_transition"]] - - # Simple correlation-based metric - cor_metric <- cor(predictions, actual, use = "complete.obs") - - # Mean squared error - mse <- mean((predictions - actual)^2, na.rm = TRUE) - - list( - cor = cor_metric, - mse = mse, - n_test = nrow(test_data) - ) -} +# Use a simple featureless learner for fast, dependency-free testing +test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") +test_measures <- list(mlr3::msr("classif.auc"), mlr3::msr("classif.acc")) # Test fit_partial_models expect_message( db$trans_models_t <- partial_models <- db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 0.7, - seed = 123, - other_param = "nonce" + seed = 123 ), "Fitting partial models for 2 transitions..." ) expect_length( partial_models, - 8L # columns; length is NULL if all fail -) -expect_match( - partial_models$fit_call, - r"{function \(data, ..., other_param = "nonce"\)}" -) -expect_equal( - partial_models$model_params, - list( - list(n_predictors = 4L, n_train = 514L, sample_frac = 0.7, other_param = "nonce"), - list(n_predictors = 4L, n_train = 748L, sample_frac = 0.7, other_param = "nonce") - ) + 8L # columns ) expect_true(all( - vapply(partial_models$model_obj_part, is.raw, logical(1)) + c("id_run", "id_trans", "learner_id", "learner_params", + "learner_spec", "crossval_measures", "crossval_predictions", "learner_full") %in% + names(partial_models) )) expect_equal( - partial_models$goodness_of_fit, - list( - list(cor = 0.0224227324563254, mse = 0.250509844417008, n_test = 219L), - list(cor = 0.0267497369302788, mse = 0.248782532059352, n_test = 319L) - ), - tolerance = 1e-6 + unique(partial_models$learner_id), + "classif.featureless" ) +expect_true(all( + vapply(partial_models$learner_spec, is.raw, logical(1)) +)) +expect_true(all( + vapply(partial_models$crossval_predictions, is.raw, logical(1)) +)) +expect_true(all( + vapply(partial_models$learner_full, is.null, logical(1)) +)) -# Test that model deserialization works -expect_inherits( - qs2::qs_deserialize(partial_models$model_obj_part[[1]]), - "glm" -) +# Test that learner_spec deserializes to an mlr3 Learner +deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) +expect_true(inherits(deserialized_spec, "Learner")) +expect_equal(deserialized_spec$id, "classif.featureless") -# Test fit_full_models, which ensures we can retrieve and evaluate the embedded model function -expect_message( - db$fit_full_models( - gof_criterion = "cor", - gof_maximize = TRUE, - ), - "Fitting full models for" -) +# crossval_measures should be named lists with measure IDs as keys +expect_true(all(vapply( + partial_models$crossval_measures, + function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), + logical(1) +))) -# test the package's standard rf fit and append to disk -expect_message( - db$trans_models_t <- db$fit_partial_models( - fit_fun = fit_ranger, - gof_fun = gof_ranger, - seed = 1244244, - ) -) -# test the package's standard glm quasibinomial fit -expect_message( - db$fit_partial_models( - fit_fun = fit_glm, - gof_fun = gof_glm, - seed = 1244244, - ) -) +# Test fit_full_models, which reconstructs and retrains on full data expect_message( db$trans_models_t <- full_models <- db$fit_full_models( - gof_criterion = "auc", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "Fitting full models for" ) -# test DB round trip +# Test DB round trip expect_equal(nrow(full_models), 2L) -expect_equal(db$row_count("trans_models_t"), 4L) -full_mods_roundtrip <- db$trans_models_t[id_trans == 2L & model_family == "ranger"] -data.table::setattr( - full_mods_roundtrip, - "parquet_db_t_class", - NULL -) # remove attribute for testing equivalence +full_mods_roundtrip <- db$trans_models_t[id_trans == 2L & learner_id == "classif.featureless"] +data.table::setattr(full_mods_roundtrip, "parquet_db_t_class", NULL) expect_identical( full_mods_roundtrip, - full_models[id_trans == 2L & model_family == "ranger", ] + full_models[id_trans == 2L & learner_id == "classif.featureless"] ) -# Check that both partial and full models are present -expect_true(all(!vapply(full_models$model_obj_part, is.null, logical(1)))) -expect_true(all(!vapply(full_models$model_obj_full, is.null, logical(1)))) +# Check that both crossval_predictions and learner_full are present +expect_true(all(!vapply(full_models$crossval_predictions, is.null, logical(1)))) +expect_true(all(!vapply(full_models$learner_full, is.null, logical(1)))) -# Test that full model deserialization works -expect_inherits( - qs2::qs_deserialize(full_models$model_obj_full[[1]]), - "ranger" -) +# Test that learner_full deserializes to a trained mlr3 Learner +deserialized_full <- qs2::qs_deserialize(full_models$learner_full[[1]]) +expect_true(inherits(deserialized_full, "Learner")) +expect_false(is.null(deserialized_full$model)) # Test model selection with minimize criterion expect_message( full_models_min <- db$fit_full_models( - gof_criterion = "mse", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.acc", gof_maximize = FALSE ), "Fitting full models for" ) -# Test error handling - missing fit_fun parameter -expect_error( - db$fit_partial_models( - gof_fun = gof_mock, - sample_frac = 0.7 - ), - "argument \"fit_fun\" is missing" -) - -# Test error handling - missing gof_fun parameter +# Test error handling - missing learner parameter expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, + measures = test_measures, sample_frac = 0.7 ), - "argument \"gof_fun\" is missing" -) - -# Test error handling - fit_fun is not a function -expect_error( - db$fit_partial_models( - fit_fun = "not_a_function", - gof_fun = gof_mock - ), - "fit_fun must be a function" + "argument \"learner\" is missing" ) -# Test error handling - gof_fun is not a function +# Test error handling - learner is not an mlr3 Learner expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = "not_a_function" + learner = "not_a_learner", + measures = test_measures ), - "gof_fun must be a function" + "learner must be an mlr3 Learner or AutoTuner" ) # Test error handling - invalid sample_frac expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 0 ), "sample_frac must be between 0 and 1" @@ -232,8 +162,8 @@ expect_error( expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 1 ), "sample_frac must be between 0 and 1" @@ -243,48 +173,55 @@ expect_error( db$delete_from("trans_models_t") expect_error( db$fit_full_models( - gof_criterion = "cor", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "trans_models_t is missing" ) -# Test fit function that throws an error -fit_error <- function(data, ...) { - stop("Intentional error for testing") -} +# Test fit function that throws an error (simulate via a bad learner) +# We achieve this by running with a learner that fails (no viable data after +# deleting the table above means we re-populate and then use a bad setup). +# Instead, test via a mock that warns on error using an out-of-range sample_frac +# -> use a real error scenario by removing all preds +db2 <- make_test_db(include_neighbors = FALSE, include_trans_preds = FALSE) +db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( + id_run = 0L, + id_pred = 99999L, # non-existent predictor + id_trans = 1L +)) expect_warning( partial_models_error <- - db$fit_partial_models( - fit_fun = fit_error, - gof_fun = gof_mock, + db2$fit_partial_models( + learner = test_learner, + measures = test_measures, sample_frac = 0.7, seed = 123 ), - "Intentional error for testing" + "No predictor columns|No data" ) - -# Should return NULL when all transitions fail expect_equal( - partial_models_error, - as_trans_models_t(data.table::data.table( - id_run = 0L, - id_trans = 1:2, - model_family = "error", - model_params = list(NULL), - goodness_of_fit = list( - list(failed = TRUE, message = "Intentional error for testing"), - list(failed = TRUE, message = "Intentional error for testing") - ), - fit_call = "function (data, ...) \n {\n stop(\"Intentional error for testing\")\n }", - model_obj_part = list(NULL), - model_obj_full = list(NULL) - )) + partial_models_error$learner_id, + "error" ) # Test print method +db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +db3$trans_models_t <- db3$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 +) +db3$trans_models_t <- db3$fit_full_models( + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", + gof_maximize = TRUE +) expect_stdout( - print(full_models), + print(db3$trans_models_t), "Transition Models Table|Total models" ) From dcf8d494c5a2a69d3593f2de384ef81ef3dcef78 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:38:08 +0000 Subject: [PATCH 03/15] Rename l_id/l_params/l_spec to clearer names per code review Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/16192889-e909-4511-bc6c-4fc80926b365 Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_models_t.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index cdf6adf..6a73873 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -139,20 +139,20 @@ fit_partial_model_worker <- function( trained_learner } - l_id <- extract_from$id - l_params <- Filter( + learner_id_val <- extract_from$id + learner_params_val <- Filter( function(v) is.atomic(v) && length(v) == 1L, extract_from$param_set$values ) - l_params <- if (length(l_params) == 0L) NULL else l_params - l_spec <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) + learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val + learner_spec_blob <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = l_id, - learner_params = list(l_params), - learner_spec = list(l_spec), + learner_id = learner_id_val, + learner_params = list(learner_params_val), + learner_spec = list(learner_spec_blob), crossval_measures = list(scores), crossval_predictions = list(qs2::qs_serialize(prediction)), learner_full = list(NULL) @@ -210,17 +210,17 @@ fit_full_model_worker <- function(item, db, ...) { # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) learner_spec_raw <- item[["learner_spec"]][[1L]] - l_id <- item[["learner_id"]] - l_params <- item[["learner_params"]][[1L]] + learner_id_val <- item[["learner_id"]] + learner_params_val <- item[["learner_params"]][[1L]] trained_learner <- tryCatch( qs2::qs_deserialize(learner_spec_raw), error = function(e) { warning(glue::glue( - "learner_spec deserialization failed for {l_id}: {e$message}; ", + "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", "falling back to do.call reconstruction" )) - do.call(mlr3::lrn, c(list(l_id), as.list(l_params))) + do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) } ) @@ -229,7 +229,7 @@ fit_full_model_worker <- function(item, db, ...) { list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = l_id, + learner_id = learner_id_val, learner_full = list(qs2::qs_serialize(trained_learner)) ) }, From f6555a6dff4f8049db8ac8b0ac65a6e50c071df5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:33:02 +0000 Subject: [PATCH 04/15] Address review: crossval_score rename, mlr3 to Imports, warning, docs, char measures, no-copy predict Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/0bd7d123-2149-49b7-b96d-7fb7ff71f3ac Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- DESCRIPTION | 2 +- R/trans_models_t.R | 51 +++++++++++++++-------- R/trans_pot_t.R | 8 ++-- inst/tinytest/test_integ_trans_models_t.R | 11 ++--- 4 files changed, 43 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fac40ea..622ea9c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,7 @@ Imports: DBI, duckdb (>= 1.5.2), glue, + mlr3, qs2, R6, Rcpp, @@ -26,7 +27,6 @@ Imports: Suggests: base64enc, butcher, - mlr3, mlr3viz, pROC, processx, diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 6a73873..d1e6aa5 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -16,7 +16,7 @@ #' querying; complete hyperparameters are captured by `learner_spec` #' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for #' AutoTuners, this is the optimal inner learner after tuning -#' - `crossval_measures`: MAP of cross-validation performance scores +#' - `crossval_score`: MAP of cross-validation performance scores #' (from `prediction$score(measures)`) #' - `crossval_predictions`: BLOB of serialized mlr3 `PredictionClassif` #' on the held-out test split @@ -31,7 +31,7 @@ as_trans_models_t <- function(x) { learner_id = character(0), learner_params = list(), learner_spec = list(), - crossval_measures = list(), + crossval_score = list(), crossval_predictions = list(), learner_full = list() ) @@ -45,7 +45,7 @@ as_trans_models_t <- function(x) { x, "trans_models_t", key_cols = c("id_run", "id_trans", "learner_id"), - map_cols = c("learner_params", "crossval_measures"), + map_cols = c("learner_params", "crossval_score"), partition_cols = "id_run" ) } @@ -153,7 +153,7 @@ fit_partial_model_worker <- function( learner_id = learner_id_val, learner_params = list(learner_params_val), learner_spec = list(learner_spec_blob), - crossval_measures = list(scores), + crossval_score = list(scores), crossval_predictions = list(qs2::qs_serialize(prediction)), learner_full = list(NULL) ) @@ -168,7 +168,7 @@ fit_partial_model_worker <- function( learner_id = "error", learner_params = list(NULL), learner_spec = list(NULL), - crossval_measures = list(NULL), + crossval_score = list(NULL), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -216,11 +216,12 @@ fit_full_model_worker <- function(item, db, ...) { trained_learner <- tryCatch( qs2::qs_deserialize(learner_spec_raw), error = function(e) { + fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) warning(glue::glue( "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to do.call reconstruction" + "falling back to reconstructed learner: {fallback$format()}" )) - do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) + fallback } ) @@ -254,8 +255,10 @@ fit_full_model_worker <- function(item, db, ...) { #' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained #' for each transition; the original object is not modified. For `AutoTuner`, #' the optimal inner learner is extracted after tuning. -#' @param measures A list of mlr3 `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`) -#' used to score the held-out predictions; results are written to `crossval_measures`. +#' @param measures Either a character vector of mlr3 measure IDs +#' (e.g. `c("classif.auc", "classif.acc")`) or a list of instantiated mlr3 +#' `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`). Character IDs are +#' converted via `mlr3::msrs()` internally. Results are written to `crossval_score`. #' @param sample_frac Numeric between 0 and 1 indicating #' the fraction of data to use for training the partial models. The rest is used for #' testing and calculating goodness-of-fit metrics. Default is 0.7 (70% training, 30% @@ -263,6 +266,9 @@ fit_full_model_worker <- function(item, db, ...) { #' @param seed Optional integer seed for reproducible subsampling. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. +#' @return A [trans_models_t] table with one row per viable transition, containing +#' the learner identity, serialized spec, cross-validation scores (`crossval_score`), +#' and serialized held-out predictions (`crossval_predictions`). fit_partial_models <- function( self, learner, @@ -271,6 +277,11 @@ fit_partial_models <- function( seed = NULL, cluster = NULL ) { + # Accept either a character vector of measure IDs or a list of Measure objects + if (is.character(measures)) { + measures <- mlr3::msrs(measures) + } + trans_preds_nested <- data.table::as.data.table(self$trans_preds_t)[, .(id_pred = list(id_pred)), @@ -289,7 +300,7 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty list" = is.list(measures) && length(measures) > 0L, + "measures must be a non-empty character vector or list" = (is.list(measures) || is.character(measures)) && length(measures) > 0L, "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -320,14 +331,18 @@ fit_partial_models <- function( #' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and #' used as a last-resort fallback if both `learner_spec` deserialization and #' `do.call(mlr3::lrn, ...)` reconstruction fail. -#' @param measures A list of mlr3 `Measure` objects; kept for API consistency. -#' @param gof_criterion Character string specifying which cross-validation measure to use +#' @param measures Either a character vector of mlr3 measure IDs or a list of `Measure` +#' objects; kept for API consistency. +#' @param gof_criterion Character string specifying which cross-validation score to use #' for selecting the best partial model per transition (must match a key in -#' `crossval_measures`, e.g. `"classif.auc"`). +#' `crossval_score`, e.g. `"classif.auc"`). #' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum #' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. +#' @return A [trans_models_t] table with one row per transition, containing the columns +#' from the best partial model plus `learner_full` with the serialized fully-trained +#' learner. fit_full_models <- function( self, learner, @@ -369,7 +384,7 @@ fit_full_models <- function( and pn.id_trans = tm.id_trans qualify row_number() over ( partition by tm.id_run, tm.id_trans - order by tm.crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by tm.crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} ) = 1; ]" )) @@ -414,7 +429,7 @@ fit_full_models <- function( "learner_id", "learner_params", "learner_spec", - "crossval_measures", + "crossval_score", "crossval_predictions" ), where = glue::glue( @@ -430,7 +445,7 @@ fit_full_models <- function( `:=`( learner_params = i.learner_params, learner_spec = i.learner_spec, - crossval_measures = i.crossval_measures, + crossval_score = i.crossval_score, crossval_predictions = i.crossval_predictions ) ] |> @@ -450,7 +465,7 @@ validate.trans_models_t <- function(x, ...) { "learner_id", "learner_params", "learner_spec", - "crossval_measures", + "crossval_score", "crossval_predictions", "learner_full" ) @@ -467,7 +482,7 @@ validate.trans_models_t <- function(x, ...) { is.character(x[["learner_id"]]), is.list(x[["learner_params"]]), is.list(x[["learner_spec"]]), - is.list(x[["crossval_measures"]]), + is.list(x[["crossval_score"]]), is.list(x[["crossval_predictions"]]), is.list(x[["learner_full"]]), all(x[["id_trans"]] > 0), diff --git a/R/trans_pot_t.R b/R/trans_pot_t.R index d71a36b..f158d4e 100644 --- a/R/trans_pot_t.R +++ b/R/trans_pot_t.R @@ -116,7 +116,7 @@ predict_trans_pot <- function( select learner_full from {self$get_read_expr("trans_models_t")} where id_trans = {id_trans} - order by crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} limit 1 ]" )) @@ -141,10 +141,8 @@ predict_trans_pot <- function( next } - # Predict probabilities using mlr3 predict_newdata; drop id_coord (non-feature) - pred_cols <- grep("^id_pred_", names(pred_data_post), value = TRUE) - pred_feature_data <- pred_data_post[, .SD, .SDcols = pred_cols] - probs <- learner_obj$predict_newdata(pred_feature_data)$prob[, "TRUE"] + # Predict probabilities using mlr3 predict_newdata; id_coord is dropped automatically + probs <- learner_obj$predict_newdata(pred_data_post)$prob[, "TRUE"] # Ensure probabilities are in [0, 1] probs <- pmax(0, pmin(1, probs)) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 0308d57..2b56ecc 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -14,7 +14,7 @@ trans_models_t <- as_trans_models_t(data.table::data.table( learner_spec = list( charToRaw("learner spec blob") ), - crossval_measures = list( + crossval_score = list( list(classif.auc = 0.8) ), crossval_predictions = list( @@ -45,7 +45,8 @@ db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) # Use a simple featureless learner for fast, dependency-free testing test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") -test_measures <- list(mlr3::msr("classif.auc"), mlr3::msr("classif.acc")) +# measures can be passed as a character vector of IDs (convenience) or as a list of Measure objects +test_measures <- c("classif.auc", "classif.acc") # Test fit_partial_models expect_message( @@ -63,7 +64,7 @@ expect_length( ) expect_true(all( c("id_run", "id_trans", "learner_id", "learner_params", - "learner_spec", "crossval_measures", "crossval_predictions", "learner_full") %in% + "learner_spec", "crossval_score", "crossval_predictions", "learner_full") %in% names(partial_models) )) expect_equal( @@ -85,9 +86,9 @@ deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) expect_true(inherits(deserialized_spec, "Learner")) expect_equal(deserialized_spec$id, "classif.featureless") -# crossval_measures should be named lists with measure IDs as keys +# crossval_score should be named lists with measure IDs as keys expect_true(all(vapply( - partial_models$crossval_measures, + partial_models$crossval_score, function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), logical(1) ))) From d7393dff698155f1a6bbfbd11ad911ad6f6f8caa Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:33:54 +0000 Subject: [PATCH 05/15] Fix fallback format() collapse and strengthen measures validation Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/0bd7d123-2149-49b7-b96d-7fb7ff71f3ac Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_models_t.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index d1e6aa5..ccd994c 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -219,7 +219,7 @@ fit_full_model_worker <- function(item, db, ...) { fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) warning(glue::glue( "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to reconstructed learner: {fallback$format()}" + "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" )) fallback } @@ -300,7 +300,10 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty character vector or list" = (is.list(measures) || is.character(measures)) && length(measures) > 0L, + "measures must be a non-empty character vector or list of Measure objects" = ( + (is.character(measures) || (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && + length(measures) > 0L + ), "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) From 9930643ea37bee1e66430dadbc9eb2ffb6bd3761 Mon Sep 17 00:00:00 2001 From: Jan Hartman <24587121+mmyrte@users.noreply.github.com> Date: Sat, 18 Apr 2026 09:38:16 +0200 Subject: [PATCH 06/15] air autoformat Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- R/trans_models_t.R | 18 +++++++++++------- inst/tinytest/test_integ_trans_models_t.R | 12 ++++++++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index ccd994c..1ec4bfc 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -300,10 +300,11 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty character vector or list of Measure objects" = ( - (is.character(measures) || (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && - length(measures) > 0L - ), + "measures must be a non-empty character vector or list of Measure objects" = ((is.character( + measures + ) || + (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && + length(measures) > 0L), "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -553,9 +554,12 @@ get_crossval_plots <- function(self, id_run = NULL, id_trans = NULL) { }) names(plots) <- paste0( - "id_run=", models$id_run, - "_id_trans=", models$id_trans, - "_", models$learner_id + "id_run=", + models$id_run, + "_id_trans=", + models$id_trans, + "_", + models$learner_id ) plots diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 2b56ecc..5600065 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -63,8 +63,16 @@ expect_length( 8L # columns ) expect_true(all( - c("id_run", "id_trans", "learner_id", "learner_params", - "learner_spec", "crossval_score", "crossval_predictions", "learner_full") %in% + c( + "id_run", + "id_trans", + "learner_id", + "learner_params", + "learner_spec", + "crossval_score", + "crossval_predictions", + "learner_full" + ) %in% names(partial_models) )) expect_equal( From cf15553161211d7610562eb12d618a9141f6e6a4 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:58:33 +0200 Subject: [PATCH 07/15] more concise AGENTS.md --- .gitignore | 1 - AGENTS.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ rproject.toml | 16 ++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md create mode 100644 rproject.toml diff --git a/.gitignore b/.gitignore index 1ebed01..147b726 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,6 @@ vignettes/*.pdf # virtual environments renv* rv* -rproject.toml # ignore duckdb database files and parquet *db diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..77b2634 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,84 @@ +# Development instructions + +If you are a human, take these instructions as guidance. +If you are an agent, follow these instructions. +Ask for clarifications where specifications are unclear or edge cases crop up. + +This is an R package that is used for land use / land cover change analyses and simulations. +It builds on the approaches of [lulcc](https://github.com/simonmoulds/lulcc), [clumpy](https://github.com/mmyrte/clumpy), and [Dinamica EGO](https://dinamicaego.com/). + +## Project layout + +- `R/parquet_db.R`, `R/parquet_db_utils.R`: generic parquet-backed DB layer +- `R/evoland_db.R`, `R/evoland_db_util.R`, `R/evoland_db_views.R`: domain DB and views (inherits parquet_db); register new methods using `create_method_binding` in `evoland_db` constructor +- `R/*_t.R`: table type definitions (`as_*_t` constructors) +- `R/util*.R`: internal helpers +- `R/*.R` (remaining): domain logic (allocation, filtering, modelling) +- `src/`: Rcpp / C++ code +- `inst/tinytest/`: tests +- `inst/dinamica_models/`: Dinamica EGO models +- `inst/*.sql`: SQL views; use glue for string interpolation +- `data-raw/`: raw data and test fixture generation scripts +- `man/`: documentation generated by roxygen2 +- `vignettes/*.qmd`: vignettes and how-tos + +## Environment setup + +Use [rv](https://a2-ai.github.io/rv-docs/). Run `rv sync` to install all dependencies (see `rproject.toml`). +Recompile and attach the package using `pkgload::load_all()`. + +## Conventions and Style + +Prefer base R solutions over external dependencies where they are equally readable. +Avoid new dependencies. +Avoid packages from the tidyverse, as their APIs tend to be unstable. +Avoid niche packages that are seldom maintained; if the functionality is simple, rather implement it as a non-exported utility function. + +Prefer exact double bracket `[["name"]]` list subsetting over `$name`, which does partial matching. + +Use `air` as autoformatter (`./air.toml`). +Use `lintr` for linting (`./.lintr`). +If running in an IDE, rely on language server notes. +Prefer explanatory names over comments in your code. + +Let errors propagate naturally; only add early guards when the default error message would confuse the user. +Use the `stopifnot("error message" = condition)` pattern for assertions. + +## Testing + +Use tinytest, see `inst/tinytest`. +Non-exported functions need to be tested as `evoland:::private_function`, because a package can be tested after installation. +Be parsimonious when constructing tests. + +- Test the full package using `R -e "tinytest::build_test_install()"`. +- Test individual files using `R -e "pkgload::load_all(); tinytest::run_test_file('inst/tinytest/somefile.R')"` + +## Documentation + +Use markdown roxygen for function and class documentation. +Use `R -e "roxygen2::roxygenize()"` to synchronize roxygen comments and `.Rd` files. +Writte tutorials and how-to as quarto files in `vignettes/*.qmd`. + +## Rcpp components + +C++ code is in the `src/` folder. +This code interfaces with R using Rcpp. +Build binaries using `pkgbuild::build()`. +Clean build objects using `pkgbuild::clean_dll()`. +Prefer `Rcpp` types. +Only write Rcpp-free headers when the code should also compile as a standalone program. + +## Database + +- Storage is done in parquet files written and read via an in-memory DuckDB instance. +- `R/parquet_db.R` specifies an R6 class that provides database operations to write and retrieve `data.table` objects to parquet files. + - `parquet_db_t` is a subclass of the `data.table` S3 class, see `R/parquet_db_utils.R` + - `parquet_db_t` objects can hold attributes used to define + - key columns, i.e. uniqueness columns + - hive partitioning columns + - map columns, i.e. R list columns of named lists translated to DuckDB MAP columns +- Domain specific database elements are in `R/evoland_db.R`; `evoland_db` inherits from `parquet_db`. + - The schema for this database is (for now) distributed across the class definitions: all `R/*_t.R` files contain `as_*_t` class constructors using `as_parquet_db_t`. + - Because the parquet files may be written to from external tools, they should be considered part of the API. Schema changes should be avoided as much as possible. + - Ad-hoc views are suffixed `_v` and generally exposed as active bindings, or as methods if they are parameterized. + - Every new method must be added in `R/evoland_db.R` and _not_ with `$set`. This is because the roxygen documentation routine for R6 objects relies on all documentation being available in a single file. diff --git a/rproject.toml b/rproject.toml new file mode 100644 index 0000000..e21002e --- /dev/null +++ b/rproject.toml @@ -0,0 +1,16 @@ +[project] +name = "evoland-plus" +r_version = "4.5" + +repositories = [ + { alias = "CRAN", url = "https://stat.ethz.ch/CRAN/" }, +] + +dependencies = [ + { name = "evoland", path = ".", install_suggestions = true }, + # dev dependencies + "devtools", + "mirai", + "httpgd", + "languageserver", +] From 8db92eefec51f6d184805cf6091d5cf8a4017e14 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:59:10 +0200 Subject: [PATCH 08/15] slightly edit comments --- R/trans_models_t.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 1ec4bfc..1c8dfe6 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -108,11 +108,12 @@ fit_partial_model_worker <- function( sample(idx_false, n_train_false) ) - # Subset to task columns (did_transition + predictors) and coerce target + # Subset to task columns (did_transition + predictors) task_cols <- c("did_transition", pred_cols) train_data <- trans_pred_data_full[train_idx, .SD, .SDcols = task_cols] test_data <- trans_pred_data_full[-train_idx, .SD, .SDcols = task_cols] + # Coerce target; mlr3 uses factors internally also for twoclass classification train_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] test_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] From 98394024f3e36c90c33c2fc1a5934080a983d380 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:41:29 +0000 Subject: [PATCH 09/15] Refactor fit_full_models: two modes, select_score/select_maximize, simplified fetch; fix docs/guards/tests Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/44f0fa02-3b76-45d5-a99b-1d20977bcc9e Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/evoland_db.R | 22 +- R/trans_models_t.R | 318 ++++++++++++---------- inst/tinytest/test_integ_allocation.R | 30 +- inst/tinytest/test_integ_trans_models_t.R | 76 ++++-- 4 files changed, 264 insertions(+), 182 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index ba0064d..0693518 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -171,19 +171,19 @@ evoland_db <- R6::R6Class( }, #' @description - #' Fit full models on complete data using the best partial model configuration for - #' each transition, see [fit_full_models()] - #' @param learner An mlr3 `Learner` or `AutoTuner` object; used as last-resort - #' fallback for reconstruction. - #' @param measures A list of mlr3 `Measure` objects; kept for API consistency. - #' @param gof_criterion Which cross-validation measure to use for model selection (e.g., `"classif.auc"`) - #' @param gof_maximize Maximize (TRUE) or minimize (FALSE) the gof_criterion? + #' Fit full models (trained on the complete dataset) for each viable transition, + #' see [fit_full_models()]. Two mutually exclusive modes: pass `learner` to train + #' directly, or pass `select_score` to pick the best partial model by score. + #' @param learner An mlr3 `Learner` or `AutoTuner` for direct-learner mode (`NULL` + #' when `select_score` is used). + #' @param select_score Measure ID string for score-select mode, e.g. `"classif.auc"` + #' (`NULL` when `learner` is used). + #' @param select_maximize Logical; maximize (`TRUE`) or minimize (`FALSE`) the score. #' @param cluster Optional cluster object for parallel processing fit_full_models = function( - learner, - measures, - gof_criterion, - gof_maximize, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, cluster = NULL ) { create_method_binding(fit_full_models) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 1c8dfe6..b42421e 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -11,7 +11,8 @@ #' @return A data.table of class "trans_models_t" with columns: #' - `id_run`: Foreign key to runs_t #' - `id_trans`: Foreign key to trans_meta_t -#' - `learner_id`: mlr3 learner key, e.g. `"classif.ranger"` +#' - `learner_id`: mlr3 twoclass [LearnerClassif](https://mlr3.mlr-org.com/reference/LearnerClassif.html) +#' key, e.g. `"classif.ranger"` #' - `learner_params`: MAP of atomic scalar learner hyperparameters for #' querying; complete hyperparameters are captured by `learner_spec` #' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for @@ -60,10 +61,6 @@ fit_partial_model_worker <- function( seed = NULL, sample_frac = 0.7 ) { - if (!requireNamespace("mlr3", quietly = TRUE)) { - stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") - } - id_run_orig <- db$id_run on.exit(db$id_run <- id_run_orig, add = TRUE) db$id_run <- item[["id_run"]] @@ -178,12 +175,11 @@ fit_partial_model_worker <- function( } # Worker function for full model fitting -# Not exported; used internally by fit_full_models -fit_full_model_worker <- function(item, db, ...) { - if (!requireNamespace("mlr3", quietly = TRUE)) { - stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") - } - +# Not exported; used internally by fit_full_models. +# Operates in two modes depending on whether `learner` is NULL: +# - direct mode (learner != NULL): train the passed learner clone on full data +# - score-select mode (learner == NULL): reconstruct from item$learner_spec and retrain +fit_full_model_worker <- function(item, db, learner = NULL, ...) { tryCatch( { # Fetch full data @@ -209,22 +205,39 @@ fit_full_model_worker <- function(item, db, ...) { positive = "TRUE" ) - # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) - learner_spec_raw <- item[["learner_spec"]][[1L]] - learner_id_val <- item[["learner_id"]] - learner_params_val <- item[["learner_params"]][[1L]] - - trained_learner <- tryCatch( - qs2::qs_deserialize(learner_spec_raw), - error = function(e) { - fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) - warning(glue::glue( - "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" - )) - fallback - } - ) + if (!is.null(learner)) { + # Direct mode: use the passed learner directly + trained_learner <- learner$clone(deep = TRUE) + learner_id_val <- trained_learner$id + learner_params_val <- Filter( + function(v) is.atomic(v) && length(v) == 1L, + trained_learner$param_set$values + ) + learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val + learner_spec_blob <- qs2::qs_serialize(trained_learner$clone(deep = TRUE)$reset()) + crossval_score_val <- list(NULL) + crossval_predictions_val <- list(NULL) + } else { + # Score-select mode: reconstruct from learner_spec; fall back to do.call + learner_spec_raw <- item[["learner_spec"]][[1L]] + learner_id_val <- item[["learner_id"]] + learner_params_val <- item[["learner_params"]][[1L]] + trained_learner <- tryCatch( + qs2::qs_deserialize(learner_spec_raw), + error = function(e) { + fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) + warning(glue::glue( + "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", + "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" + )) + fallback + } + ) + learner_spec_blob <- learner_spec_raw + # Pass through cross-validation results from the partial model + crossval_score_val <- item[["crossval_score"]] + crossval_predictions_val <- item[["crossval_predictions"]] + } trained_learner$train(full_task) @@ -232,6 +245,10 @@ fit_full_model_worker <- function(item, db, ...) { id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = learner_id_val, + learner_params = list(learner_params_val), + learner_spec = list(learner_spec_blob), + crossval_score = crossval_score_val, + crossval_predictions = crossval_predictions_val, learner_full = list(qs2::qs_serialize(trained_learner)) ) }, @@ -242,7 +259,11 @@ fit_full_model_worker <- function(item, db, ...) { list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = item[["learner_id"]], + learner_id = if (!is.null(learner)) learner$id else item[["learner_id"]], + learner_params = list(NULL), + learner_spec = list(NULL), + crossval_score = list(NULL), + crossval_predictions = list(NULL), learner_full = list(NULL) ) } @@ -251,7 +272,9 @@ fit_full_model_worker <- function(item, db, ...) { #' @describeIn trans_models_t Fit partial (cross-validation) models for each viable -#' transition and store results in a trans_models_t table. +#' transition; returns a [trans_models_t] object with one row per viable transition, +#' containing the learner identity, serialized spec, cross-validation scores +#' (`crossval_score`), and serialized held-out predictions (`crossval_predictions`). #' @param self [evoland_db] instance to query for transitions and predictor data #' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained #' for each transition; the original object is not modified. For `AutoTuner`, @@ -267,9 +290,6 @@ fit_full_model_worker <- function(item, db, ...) { #' @param seed Optional integer seed for reproducible subsampling. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. -#' @return A [trans_models_t] table with one row per viable transition, containing -#' the learner identity, serialized spec, cross-validation scores (`crossval_score`), -#' and serialized held-out predictions (`crossval_predictions`). fit_partial_models <- function( self, learner, @@ -330,131 +350,151 @@ fit_partial_models <- function( as_trans_models_t() } -#' @describeIn trans_models_t Fit full models for each transition based on the best -#' partial model according to a specified cross-validation criterion. +#' @describeIn trans_models_t Fit full models (trained on the complete dataset) for each +#' viable transition and return a [trans_models_t] object with `learner_full` populated. +#' Two mutually exclusive modes are supported: +#' - **Direct-learner mode** (`learner` provided, `select_score` omitted): a fresh clone of +#' `learner` is trained on the full data for each transition. `crossval_score` and +#' `crossval_predictions` will be `NULL` in the result. Does not require a prior +#' call to [fit_partial_models()]. +#' - **Score-select mode** (`select_score` provided, `learner` omitted): selects the best +#' partial model per transition by `select_score`, reconstructs its learner from +#' `learner_spec`, and retrains on the full data. Requires [fit_partial_models()] to +#' have been run first. #' @param self [evoland_db] instance to query for transitions and predictor data -#' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and -#' used as a last-resort fallback if both `learner_spec` deserialization and -#' `do.call(mlr3::lrn, ...)` reconstruction fail. -#' @param measures Either a character vector of mlr3 measure IDs or a list of `Measure` -#' objects; kept for API consistency. -#' @param gof_criterion Character string specifying which cross-validation score to use -#' for selecting the best partial model per transition (must match a key in -#' `crossval_score`, e.g. `"classif.auc"`). -#' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum -#' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. +#' @param learner An mlr3 `Learner` or `AutoTuner` object for direct-learner mode. +#' Must be `NULL` when `select_score` is provided. +#' @param select_score Character string; mlr3 measure ID (e.g. `"classif.auc"`) used to +#' rank partial models in score-select mode. Must be `NULL` when `learner` is provided. +#' @param select_maximize Logical; if `TRUE` (default) the model with the highest +#' `select_score` is selected; if `FALSE`, the lowest. Only used in score-select mode. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. -#' @return A [trans_models_t] table with one row per transition, containing the columns -#' from the best partial model plus `learner_full` with the serialized fully-trained -#' learner. fit_full_models <- function( self, - learner, - measures, - gof_criterion, - gof_maximize, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, cluster = NULL ) { + has_learner <- !is.null(learner) + has_score <- !is.null(select_score) + stopifnot( - "gof_criterion must be a character string" = is.character(gof_criterion) && - length(gof_criterion) == 1L, - "gof_maximize must be set to TRUE or FALSE" = isTRUE(gof_maximize) || isFALSE(gof_maximize), - "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) + "Provide exactly one of 'learner' or 'select_score'" = xor(has_learner, has_score) ) - # Get the best partial model per transition (scalar columns only; MAP/BLOB via fetch below) - best_model_ids <- self$get_query(glue::glue( - r"[ - with preds_nested as ( - select - id_run, - id_trans, - list(id_pred) as id_pred - from - {self$get_read_expr("trans_preds_t")} - group by - id_run, id_trans + if (has_learner) { + stopifnot( + "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner") ) - select - tm.id_run, - tm.id_trans, - tm.learner_id, - pn.id_pred, - from - {self$get_read_expr("trans_models_t")} tm, - preds_nested pn - where - pn.id_run = tm.id_run - and pn.id_trans = tm.id_trans - qualify row_number() over ( - partition by tm.id_run, tm.id_trans - order by tm.crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} - ) = 1; - ]" - )) - # Fetch learner_spec (BLOB) and learner_params (MAP) for the best rows via fetch() - # so that MAP columns are properly deserialized to named lists. - learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") - best_specs <- self$fetch( - "trans_models_t", - cols = c("id_run", "id_trans", "learner_id", "learner_spec", "learner_params"), - where = glue::glue( - "id_run in ({toString(best_model_ids$id_run)}) and ", - "id_trans in ({toString(best_model_ids$id_trans)}) and ", - "learner_id in ({learner_id_csv})" + # Direct mode: get viable transitions with their predictor lists + trans_preds_nested <- + data.table::as.data.table(self$trans_preds_t)[, + .(id_pred = list(id_pred)), + by = .(id_run, id_trans) + ] + + viable_trans <- + self$trans_meta_t[ + is_viable == TRUE, + .(id_trans) + ][ + trans_preds_nested, + on = "id_trans" + ] + + message(glue::glue( + "Fitting full models for {nrow(viable_trans)} transitions..." + )) + + viable_trans |> + split(by = c("id_run", "id_trans")) |> + run_parallel_evoland( + items = _, + worker_fun = fit_full_model_worker, + parent_db = self, + cluster = cluster, + learner = learner, + ) |> + data.table::rbindlist() |> + as_trans_models_t() + } else { + # Score-select mode + stopifnot( + "select_score must be a character string" = is.character(select_score) && + length(select_score) == 1L, + "select_maximize must be TRUE or FALSE" = isTRUE(select_maximize) || isFALSE(select_maximize), + "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) ) - ) - # Join to add id_pred and build complete item list for workers - best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] + # Identify the best partial model per transition (using QUALIFY window function) + # and get the predictor ID lists from trans_preds_t in the same query + best_model_ids <- self$get_query(glue::glue( + r"[ + with preds_nested as ( + select + id_run, + id_trans, + list(id_pred) as id_pred + from + {self$get_read_expr("trans_preds_t")} + group by + id_run, id_trans + ) + select + tm.id_run, + tm.id_trans, + tm.learner_id, + pn.id_pred, + from + {self$get_read_expr("trans_models_t")} tm, + preds_nested pn + where + pn.id_run = tm.id_run + and pn.id_trans = tm.id_trans + qualify row_number() over ( + partition by tm.id_run, tm.id_trans + order by tm.crossval_score['{select_score}'] {ifelse(select_maximize, "desc", "asc")} + ) = 1; + ]" + )) - message(glue::glue( - "Fitting full models for {nrow(best_models)} transitions..." - )) + # Fetch all trans_models_t columns (including MAP/BLOB) for the best rows in one shot + learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") + best_specs <- self$fetch( + "trans_models_t", + cols = c( + "id_run", "id_trans", "learner_id", + "learner_spec", "learner_params", + "crossval_score", "crossval_predictions" + ), + where = glue::glue( + "id_run in ({toString(best_model_ids$id_run)}) and ", + "id_trans in ({toString(best_model_ids$id_trans)}) and ", + "learner_id in ({learner_id_csv})" + ) + ) - full_models <- - best_models |> - split(by = c("id_run", "id_trans")) |> - run_parallel_evoland( - items = _, - worker_fun = fit_full_model_worker, - parent_db = self, - cluster = cluster, - ) |> - data.table::rbindlist() + # Join to add id_pred from the ranking query + best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] - # Fetch remaining columns from the best partial models and join - partial_models <- self$fetch( - "trans_models_t", - cols = c( - "id_run", - "id_trans", - "learner_id", - "learner_params", - "learner_spec", - "crossval_score", - "crossval_predictions" - ), - where = glue::glue( - "id_run in ({toString(full_models$id_run)}) and ", - "id_trans in ({toString(full_models$id_trans)}) and ", - "learner_id in ({paste0(\"'\", full_models$learner_id, \"'\", collapse = \", \")})" - ) - ) + message(glue::glue( + "Fitting full models for {nrow(best_models)} transitions..." + )) - full_models[ - partial_models, - on = c("id_run", "id_trans", "learner_id"), - `:=`( - learner_params = i.learner_params, - learner_spec = i.learner_spec, - crossval_score = i.crossval_score, - crossval_predictions = i.crossval_predictions - ) - ] |> - as_trans_models_t() + best_models |> + split(by = c("id_run", "id_trans")) |> + run_parallel_evoland( + items = _, + worker_fun = fit_full_model_worker, + parent_db = self, + cluster = cluster, + ) |> + data.table::rbindlist() |> + as_trans_models_t() + } } diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 0dd1429..2594ab7 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -5,10 +5,6 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } -if (!requireNamespace("mlr3", quietly = TRUE)) { - exit_file("mlr3 not available; skipping allocation integration tests") -} - source(file.path(system.file("tinytest", package = "evoland"), "helper_testdb.R")) db <- make_test_db() db$trans_rates_t <- db$get_obs_trans_rates() @@ -19,7 +15,7 @@ db$trans_rates_t <- extrapolate_trans_rates( ) test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") -test_measures <- list(mlr3::msr("classif.auc")) +test_measures <- c("classif.auc") # test the package's featureless learner fit and append to disk expect_message( @@ -30,15 +26,31 @@ expect_message( ), "Fitting partial models for 2 transitions..." ) + +# Score-select mode: pick best partial model by classif.auc and retrain on full data expect_message( db$trans_models_t <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE + ), + "Fitting full models for" +) + +# Direct-learner mode: train a fresh learner on full data without cross-validation +db_direct <- make_test_db() +db_direct$trans_rates_t <- db_direct$get_obs_trans_rates() +db_direct$trans_rates_t <- extrapolate_trans_rates( + db_direct$trans_rates_t, + db_direct$periods_t, + coord_count = nrow(db_direct$coords_t) +) +expect_message( + full_direct <- db_direct$fit_full_models( + learner = test_learner ), "Fitting full models for" ) +expect_true(all(vapply(full_direct$learner_full, is.raw, logical(1)))) # no data for period 4 yet expect_equal(nrow(db$fetch("lulc_data_t", where = "id_period = 4")), 0L) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 5600065..53b3935 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -32,10 +32,6 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } -if (!requireNamespace("mlr3", quietly = TRUE)) { - exit_file("mlr3 not available; skipping integration tests") -} - # Load fixtures via helper source(file.path( system.file("tinytest", package = "evoland"), @@ -101,13 +97,11 @@ expect_true(all(vapply( logical(1) ))) -# Test fit_full_models, which reconstructs and retrains on full data +# Test fit_full_models in score-select mode (picks best partial model by crossval_score) expect_message( db$trans_models_t <- full_models <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ), "Fitting full models for" ) @@ -133,10 +127,8 @@ expect_false(is.null(deserialized_full$model)) # Test model selection with minimize criterion expect_message( full_models_min <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.acc", - gof_maximize = FALSE + select_score = "classif.acc", + select_maximize = FALSE ), "Fitting full models for" ) @@ -178,14 +170,12 @@ expect_error( "sample_frac must be between 0 and 1" ) -# Test error handling - missing trans_models_t for full model fitting +# Test error handling - missing trans_models_t for full model fitting (score-select mode) db$delete_from("trans_models_t") expect_error( db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ), "trans_models_t is missing" ) @@ -217,7 +207,7 @@ expect_equal( "error" ) -# Test print method +# Test print method (score-select mode) db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) db3$trans_models_t <- db3$fit_partial_models( learner = test_learner, @@ -225,12 +215,52 @@ db3$trans_models_t <- db3$fit_partial_models( seed = 42 ) db3$trans_models_t <- db3$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ) expect_stdout( print(db3$trans_models_t), "Transition Models Table|Total models" ) + +# Test direct-learner mode: fit_full_models with a learner (no partial models needed) +db4 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +expect_message( + full_models_direct <- db4$fit_full_models( + learner = test_learner + ), + "Fitting full models for" +) +# direct mode: crossval_score and crossval_predictions should be NULL +expect_true(all(vapply(full_models_direct$crossval_score, is.null, logical(1)))) +expect_true(all(vapply(full_models_direct$crossval_predictions, is.null, logical(1)))) +# learner_full should be populated +expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]]) +expect_true(inherits(deserialized_direct, "Learner")) +expect_false(is.null(deserialized_direct$model)) + +# Test get_crossval_plots (requires mlr3viz) +if (!requireNamespace("mlr3viz", quietly = TRUE)) { + exit_file("mlr3viz not available; skipping get_crossval_plots tests") +} + +db5 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +db5$trans_models_t <- db5$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 +) + +plots <- db5$get_crossval_plots() +expect_true(is.list(plots)) +expect_equal(length(plots), nrow(db5$trans_models_t)) +# Each element should be a ggplot (or NULL for rows without predictions) +non_null_plots <- Filter(Negate(is.null), plots) +expect_true(length(non_null_plots) > 0L) +expect_true(all(vapply(non_null_plots, inherits, logical(1), "gg"))) + +# Filter by id_trans +plots_filtered <- db5$get_crossval_plots(id_trans = db5$trans_models_t$id_trans[[1]]) +expect_true(is.list(plots_filtered)) +expect_equal(length(plots_filtered), 1L) From ee71350cc8b0ccf1cc6c99689eb285943e038b1f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:53:22 +0200 Subject: [PATCH 10/15] better mlr3 tests; caught warning-inducing edge cases --- R/parquet_db.R | 3 + R/parquet_db_utils.R | 21 ++-- R/trans_models_t.R | 20 ++-- inst/tinytest/test_integ_allocation.R | 16 --- inst/tinytest/test_integ_trans_models_t.R | 132 +++++++++------------- man/evoland_db.Rd | 52 ++++++--- man/parquet_db_utils.Rd | 3 +- man/trans_models_t.Rd | 90 ++++++++++----- 8 files changed, 179 insertions(+), 158 deletions(-) diff --git a/R/parquet_db.R b/R/parquet_db.R index ceae80b..bc33530 100644 --- a/R/parquet_db.R +++ b/R/parquet_db.R @@ -149,6 +149,9 @@ parquet_db <- R6::R6Class( metadata <- private$read_parquet_metadata(table_path) map_cols <- resolve_cols(NULL, metadata, "map_cols") + if (!is.null(cols)) { + map_cols <- intersect(cols, map_cols) + } read_expr <- self$get_read_expr(table_name) # build sql query diff --git a/R/parquet_db_utils.R b/R/parquet_db_utils.R index 63d3306..6d367dc 100644 --- a/R/parquet_db_utils.R +++ b/R/parquet_db_utils.R @@ -104,13 +104,17 @@ validate.parquet_db_t <- function(x, ...) { } } else if (col %in% attr(x, "map_cols")) { for (val in x[[col]]) { - if ( - !is.null(val) && # if not NULL any of the following being true is an error - (!is.list(val) || # if it's a list - is.null(names(val)) || # or names missing - any(vapply(val, Negate(is.atomic), logical(1)))) # or any element not atomic - ) { - # then throw error + is_valid_mapcol <- function(v) { + # allow NULLs + is.null(v) || + # allow empty lists + (is.list(v) && length(v) == 0) || + # allow named lists with atomic values + (is.list(v) && !is.null(names(v)) && all(vapply(v, is.atomic, logical(1)))) + } + + # negation makes the above condition easier to read + if (Negate(is_valid_mapcol)(val)) { stop(glue::glue( "Column '{col}' specified as map_cols must be a list of ", "named lists with atomic values" @@ -304,7 +308,8 @@ kv_df_to_list <- function(x) { #' (missing argument), or upserts to it (assignment operation) #' @param table_name The name of the table to bind to. #' @param mode The mode of the binding, which determines the behavior when -#' committing data. Options are: "write_once" (default, only allows writing if table doesn't exist), "upsert" +#' committing data. Options are: "write_once" (default, only allows writing if +#' table doesn't exist), "upsert", "append", and "overwrite". create_table_binding <- function( table_name, mode = c("write_once", "upsert", "append", "overwrite") diff --git a/R/trans_models_t.R b/R/trans_models_t.R index b42421e..3e62451 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -164,9 +164,9 @@ fit_partial_model_worker <- function( id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = "error", - learner_params = list(NULL), + learner_params = list(list()), learner_spec = list(NULL), - crossval_score = list(NULL), + crossval_score = list(list()), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -215,7 +215,7 @@ fit_full_model_worker <- function(item, db, learner = NULL, ...) { ) learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val learner_spec_blob <- qs2::qs_serialize(trained_learner$clone(deep = TRUE)$reset()) - crossval_score_val <- list(NULL) + crossval_score_val <- list(list()) crossval_predictions_val <- list(NULL) } else { # Score-select mode: reconstruct from learner_spec; fall back to do.call @@ -260,9 +260,9 @@ fit_full_model_worker <- function(item, db, learner = NULL, ...) { id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = if (!is.null(learner)) learner$id else item[["learner_id"]], - learner_params = list(NULL), + learner_params = list(list()), learner_spec = list(NULL), - crossval_score = list(NULL), + crossval_score = list(list()), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -466,9 +466,13 @@ fit_full_models <- function( best_specs <- self$fetch( "trans_models_t", cols = c( - "id_run", "id_trans", "learner_id", - "learner_spec", "learner_params", - "crossval_score", "crossval_predictions" + "id_run", + "id_trans", + "learner_id", + "learner_spec", + "learner_params", + "crossval_score", + "crossval_predictions" ), where = glue::glue( "id_run in ({toString(best_model_ids$id_run)}) and ", diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 2594ab7..ff9925a 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -36,22 +36,6 @@ expect_message( "Fitting full models for" ) -# Direct-learner mode: train a fresh learner on full data without cross-validation -db_direct <- make_test_db() -db_direct$trans_rates_t <- db_direct$get_obs_trans_rates() -db_direct$trans_rates_t <- extrapolate_trans_rates( - db_direct$trans_rates_t, - db_direct$periods_t, - coord_count = nrow(db_direct$coords_t) -) -expect_message( - full_direct <- db_direct$fit_full_models( - learner = test_learner - ), - "Fitting full models for" -) -expect_true(all(vapply(full_direct$learner_full, is.raw, logical(1)))) - # no data for period 4 yet expect_equal(nrow(db$fetch("lulc_data_t", where = "id_period = 4")), 0L) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 53b3935..032bd90 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -43,7 +43,6 @@ db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") # measures can be passed as a character vector of IDs (convenience) or as a list of Measure objects test_measures <- c("classif.auc", "classif.acc") - # Test fit_partial_models expect_message( db$trans_models_t <- partial_models <- db$fit_partial_models( @@ -54,27 +53,14 @@ expect_message( ), "Fitting partial models for 2 transitions..." ) -expect_length( - partial_models, - 8L # columns -) -expect_true(all( - c( - "id_run", - "id_trans", - "learner_id", - "learner_params", - "learner_spec", - "crossval_score", - "crossval_predictions", - "learner_full" - ) %in% - names(partial_models) -)) expect_equal( - unique(partial_models$learner_id), - "classif.featureless" + partial_models[["crossval_score"]], + list( + list(classif.auc = 0.5, classif.acc = 0.547945205479452), + list(classif.auc = 0.5, classif.acc = 0.536050156739812) + ) ) +expect_equal(partial_models$learner_id[1], "classif.featureless") expect_true(all( vapply(partial_models$learner_spec, is.raw, logical(1)) )) @@ -90,13 +76,6 @@ deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) expect_true(inherits(deserialized_spec, "Learner")) expect_equal(deserialized_spec$id, "classif.featureless") -# crossval_score should be named lists with measure IDs as keys -expect_true(all(vapply( - partial_models$crossval_score, - function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), - logical(1) -))) - # Test fit_full_models in score-select mode (picks best partial model by crossval_score) expect_message( db$trans_models_t <- full_models <- db$fit_full_models( @@ -180,13 +159,8 @@ expect_error( "trans_models_t is missing" ) -# Test fit function that throws an error (simulate via a bad learner) -# We achieve this by running with a learner that fails (no viable data after -# deleting the table above means we re-populate and then use a bad setup). -# Instead, test via a mock that warns on error using an out-of-range sample_frac -# -> use a real error scenario by removing all preds -db2 <- make_test_db(include_neighbors = FALSE, include_trans_preds = FALSE) -db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( +# Test fit function that throws an error: overwrite trans_preds_t +db$trans_preds_t <- as_trans_preds_t(data.table::data.table( id_run = 0L, id_pred = 99999L, # non-existent predictor id_trans = 1L @@ -194,73 +168,73 @@ db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( expect_warning( partial_models_error <- - db2$fit_partial_models( + db$fit_partial_models( learner = test_learner, measures = test_measures, - sample_frac = 0.7, - seed = 123 + sample_frac = 0.7 ), "No predictor columns|No data" ) -expect_equal( - partial_models_error$learner_id, - "error" -) +expect_equal(partial_models_error$learner_id, "error") -# Test print method (score-select mode) -db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -db3$trans_models_t <- db3$fit_partial_models( - learner = test_learner, - measures = test_measures, - seed = 42 -) -db3$trans_models_t <- db3$fit_full_models( - select_score = "classif.auc", - select_maximize = TRUE -) -expect_stdout( - print(db3$trans_models_t), - "Transition Models Table|Total models" +# Test direct-learner mode: fit_full_models with a learner +db$set_full_trans_preds() +expect_message( + db$trans_models_t <- full_models_direct <- db$fit_full_models(learner = test_learner), + "Fitting full models for" ) +# direct mode: crossval_score and crossval_predictions should be length 0 +expect_true(all(vapply(full_models_direct$crossval_score, length, integer(1)) == 0L)) +expect_true(all(vapply(full_models_direct$crossval_predictions, length, integer(1)) == 0L)) +# learner_full should be populated +expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]])$reset() +expect_equal(deserialized_direct, test_learner) -# Test direct-learner mode: fit_full_models with a learner (no partial models needed) -db4 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) expect_message( - full_models_direct <- db4$fit_full_models( - learner = test_learner - ), + db$trans_models_t <- full_models_direct <- db$fit_full_models(learner = test_learner), "Fitting full models for" ) -# direct mode: crossval_score and crossval_predictions should be NULL -expect_true(all(vapply(full_models_direct$crossval_score, is.null, logical(1)))) -expect_true(all(vapply(full_models_direct$crossval_predictions, is.null, logical(1)))) +# direct mode: crossval_score and crossval_predictions should be length 0 +expect_true(all(vapply(full_models_direct$crossval_score, length, integer(1)) == 0L)) +expect_true(all(vapply(full_models_direct$crossval_predictions, length, integer(1)) == 0L)) # learner_full should be populated expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) -deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]]) -expect_true(inherits(deserialized_direct, "Learner")) -expect_false(is.null(deserialized_direct$model)) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]])$reset() +expect_equal(deserialized_direct, test_learner) # Test get_crossval_plots (requires mlr3viz) if (!requireNamespace("mlr3viz", quietly = TRUE)) { exit_file("mlr3viz not available; skipping get_crossval_plots tests") } -db5 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -db5$trans_models_t <- db5$fit_partial_models( - learner = test_learner, - measures = test_measures, - seed = 42 +expect_message( + db$trans_models_t <- db$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 + ), + "Fitting partial models for 2 transitions..." ) -plots <- db5$get_crossval_plots() +plots <- db$get_crossval_plots() expect_true(is.list(plots)) -expect_equal(length(plots), nrow(db5$trans_models_t)) -# Each element should be a ggplot (or NULL for rows without predictions) -non_null_plots <- Filter(Negate(is.null), plots) -expect_true(length(non_null_plots) > 0L) -expect_true(all(vapply(non_null_plots, inherits, logical(1), "gg"))) +expect_equal(length(plots), nrow(db$trans_models_t)) +expect_true(all(vapply(plots, inherits, logical(1), "gg"))) # Filter by id_trans -plots_filtered <- db5$get_crossval_plots(id_trans = db5$trans_models_t$id_trans[[1]]) -expect_true(is.list(plots_filtered)) +plots_filtered <- db$get_crossval_plots(id_trans = 1) expect_equal(length(plots_filtered), 1L) +plot_trans_1 <- plots_filtered[[1]] +expect_true(inherits(plot_trans_1, "gg")) +expect_equal( + plot_trans_1$data |> summary() |> as.vector(), + c( + "truth :219 ", + "response:219 ", + NA, + "Length:438 ", + "Class :character ", + "Mode :character " + ) +) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index caa03a2..3d72f4d 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -80,6 +80,7 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-lulc_data_as_rast}{\code{evoland_db$lulc_data_as_rast()}} \item \href{#method-evoland_db-fit_full_models}{\code{evoland_db$fit_full_models()}} \item \href{#method-evoland_db-fit_partial_models}{\code{evoland_db$fit_partial_models()}} +\item \href{#method-evoland_db-get_crossval_plots}{\code{evoland_db$get_crossval_plots()}} \item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} \item \href{#method-evoland_db-get_pruned_trans_preds_t}{\code{evoland_db$get_pruned_trans_preds_t()}} \item \href{#method-evoland_db-predict_trans_pot}{\code{evoland_db$predict_trans_pot()}} @@ -401,13 +402,14 @@ NULL (default), all periods are included.} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-fit_full_models}{}}} \subsection{Method \code{fit_full_models()}}{ -Fit full models on complete data using the best partial model configuration for -each transition, see \code{\link[=fit_full_models]{fit_full_models()}} +Fit full models (trained on the complete dataset) for each viable transition, +see \code{\link[=fit_full_models]{fit_full_models()}}. Two mutually exclusive modes: pass \code{learner} to train +directly, or pass \code{select_score} to pick the best partial model by score. \subsection{Usage}{ \if{html}{\out{