From 4bcac1c1e85200f64fa41b9dbf0c55e4c819e482 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:01:47 +0000 Subject: [PATCH 01/15] Add mlr3filters TODO to trans_preds_t Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/2eb1143b-0b8d-42f2-ad27-052b4100baea Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_preds_t.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 821e527..7c6b4fe 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -169,6 +169,13 @@ prune_trans_worker <- function(item, db, filter_fun, ordered_pred_data = FALSE, ) } +# TODO: mlr3filters (https://mlr3filters.mlr-org.com/) provides a range of filter methods +# (mutual information, permutation importance, correlation-based, etc.) that could replace or +# supplement the current covariance_filter approach here. This refactoring should follow the +# same patterns as trans_models_t: accept an mlr3 Filter object (or id string + params) in +# place of filter_fun, store the filter id and parameters as DuckDB-native MAP columns, and +# serialize the filter object as a BLOB for full reproducibility. + #' @describeIn trans_preds_t Get a pruned set of transition-predictor relationships #' based on a filtering function #' @param filter_fun A function that takes a transition-predictor data (cf. [trans_pred_data_v]) and From b649c94b40c8ab8ca0cd5908980fa5b9c17d36cb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:36:27 +0000 Subject: [PATCH 02/15] Refactor trans_models_t to use mlr3 interface Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/16192889-e909-4511-bc6c-4fc80926b365 Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- DESCRIPTION | 2 + R/evoland_db.R | 29 +- R/trans_models_t.R | 386 +++++++++++++--------- R/trans_pot_t.R | 37 +-- inst/tinytest/test_integ_allocation.R | 18 +- inst/tinytest/test_integ_trans_models_t.R | 281 ++++++---------- 6 files changed, 388 insertions(+), 365 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 322f390..fac40ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,8 @@ Imports: Suggests: base64enc, butcher, + mlr3, + mlr3viz, pROC, processx, quarto, diff --git a/R/evoland_db.R b/R/evoland_db.R index a29edb7..ba0064d 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -173,12 +173,15 @@ evoland_db <- R6::R6Class( #' @description #' Fit full models on complete data using the best partial model configuration for #' each transition, see [fit_full_models()] - #' @param partial_models A trans_models_t table with partial models (see [fit_partial_models()]) - #' @param gof_criterion Which goodness-of-fit metric to use for model selection (e.g., "auc") + #' @param learner An mlr3 `Learner` or `AutoTuner` object; used as last-resort + #' fallback for reconstruction. + #' @param measures A list of mlr3 `Measure` objects; kept for API consistency. + #' @param gof_criterion Which cross-validation measure to use for model selection (e.g., `"classif.auc"`) #' @param gof_maximize Maximize (TRUE) or minimize (FALSE) the gof_criterion? #' @param cluster Optional cluster object for parallel processing fit_full_models = function( - partial_models, + learner, + measures, gof_criterion, gof_maximize, cluster = NULL @@ -189,23 +192,29 @@ evoland_db <- R6::R6Class( #' @description Fit partial models for each viable transition using stratified #' sampling. Models are trained on a subsample and evaluated on held-out data, see #' [fit_partial_models()] for details. - #' @param fit_fun Function for generating a model object. - #' @param gof_fun Function to evaluate goodness of fit. + #' @param learner An mlr3 `Learner` or `AutoTuner` R6 object. + #' @param measures A list of mlr3 `Measure` objects for scoring the held-out split. #' @param sample_frac Fraction in \(0, 1\) for stratified sampling. #' @param seed Random seed for reproducible sampling #' @param cluster Optional cluster object for parallel processing - #' @param ... additional arguments passed to fit_fun fit_partial_models = function( - fit_fun, + learner, + measures, sample_frac = 0.7, - gof_fun, seed = NULL, - cluster = NULL, - ... + cluster = NULL ) { create_method_binding(fit_partial_models) }, + #' @description + #' Get cross-validation plots for stored predictions, see [get_crossval_plots()] + #' @param id_run Optional integer; filter by run ID. + #' @param id_trans Optional integer; filter by transition ID. + get_crossval_plots = function(id_run = NULL, id_trans = NULL) { + create_method_binding(get_crossval_plots) + }, + #' @description #' Set an initial full set of transition / predictor relations, see [set_full_trans_preds()] #' @param overwrite Logical, whether to overwrite existing `trans_preds_t` table. Default FALSE. diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 47bcd73..cdf6adf 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -2,7 +2,7 @@ #' #' Creates a trans_models_t table for storing transition model metadata and #' serialized model objects. This function creates an empty table with proper -#' structure for storing fitted models. +#' structure for storing fitted models via the mlr3 interface. #' #' @name trans_models_t #' @@ -11,24 +11,29 @@ #' @return A data.table of class "trans_models_t" with columns: #' - `id_run`: Foreign key to runs_t #' - `id_trans`: Foreign key to trans_meta_t -#' - `model_family`: Model family (e.g., "rf", "glm", "bayesian") -#' - `model_params`: Map of model (hyper) parameters -#' - `goodness_of_fit`: Map of various measures of fit (e.g., ROC AUC, RMSE) -#' - `fit_call`: Character string of the original fit function call for reproducibility -#' - `model_obj_part`: BLOB of serialized model object for validation -#' - `model_obj_full`: BLOB of serialized model object for extrapolation +#' - `learner_id`: mlr3 learner key, e.g. `"classif.ranger"` +#' - `learner_params`: MAP of atomic scalar learner hyperparameters for +#' querying; complete hyperparameters are captured by `learner_spec` +#' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for +#' AutoTuners, this is the optimal inner learner after tuning +#' - `crossval_measures`: MAP of cross-validation performance scores +#' (from `prediction$score(measures)`) +#' - `crossval_predictions`: BLOB of serialized mlr3 `PredictionClassif` +#' on the held-out test split +#' - `learner_full`: BLOB of serialized trained mlr3 `Learner` fitted on +#' the full dataset, used for extrapolation #' @export as_trans_models_t <- function(x) { if (missing(x)) { x <- data.table::data.table( id_run = integer(0), id_trans = integer(0), - model_family = character(0), - model_params = list(), - goodness_of_fit = list(), - fit_call = character(0), - model_obj_part = list(), - model_obj_full = list() + learner_id = character(0), + learner_params = list(), + learner_spec = list(), + crossval_measures = list(), + crossval_predictions = list(), + learner_full = list() ) } @@ -39,8 +44,8 @@ as_trans_models_t <- function(x) { as_parquet_db_t( x, "trans_models_t", - key_cols = c("id_run", "id_trans", "fit_call"), - map_cols = c("model_params", "goodness_of_fit"), + key_cols = c("id_run", "id_trans", "learner_id"), + map_cols = c("learner_params", "crossval_measures"), partition_cols = "id_run" ) } @@ -50,27 +55,19 @@ as_trans_models_t <- function(x) { fit_partial_model_worker <- function( item, db, - fit_fun, - gof_fun, + learner, + measures, seed = NULL, - sample_frac = 0.7, - ... + sample_frac = 0.7 ) { + if (!requireNamespace("mlr3", quietly = TRUE)) { + stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") + } + id_run_orig <- db$id_run on.exit(db$id_run <- id_run_orig, add = TRUE) db$id_run <- item[["id_run"]] - # We modify the fit_fun by attaching the fit_fun_args to its formals. This allows - # us to deparse it so as to store a string representation. When calling the - # function object - possibly reconstructed using str2lang - only the data argument - # should change (subsampled/partial or full) - formals(fit_fun) <- c(formals(fit_fun), list(...)) - - # Deparse to character string for storage - fit_call_str <- - deparse(fit_fun, width.cutoff = 500L) |> - paste(collapse = "\n ") - tryCatch( { # Fetch ALL data into memory @@ -95,12 +92,10 @@ fit_partial_model_worker <- function( )) } - # Stratified sampling - # Split by did_transition (TRUE/FALSE) + # Stratified sampling by did_transition idx_true <- which(trans_pred_data_full[["did_transition"]]) idx_false <- which(!trans_pred_data_full[["did_transition"]]) - # Sample from each group n_train_true <- ceiling(length(idx_true) * sample_frac) n_train_false <- ceiling(length(idx_false) * sample_frac) @@ -113,58 +108,70 @@ fit_partial_model_worker <- function( sample(idx_false, n_train_false) ) - train_data <- trans_pred_data_full[train_idx] - test_data <- trans_pred_data_full[!train_idx] - - # actually evaluate the fit_fun - model <- fit_fun(data = train_data) + # Subset to task columns (did_transition + predictors) and coerce target + task_cols <- c("did_transition", pred_cols) + train_data <- trans_pred_data_full[train_idx, .SD, .SDcols = task_cols] + test_data <- trans_pred_data_full[-train_idx, .SD, .SDcols = task_cols] - # Evaluate on test data - goodness_of_fit <- gof_fun(model = model, test_data = test_data) + train_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] + test_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] - # Extract model family - model_family <- if (!is.null(attr(model, "family"))) { - as.character(attr(model, "family")) - } else if (inherits(model, "glm")) { - paste0("glm_", model$family$family) - } else { - class(model)[1] - } - - # Extract model params for subsetting - model_params <- list( - n_predictors = length(pred_cols), - n_train = nrow(train_data), - sample_frac = sample_frac, - ... + # Build mlr3 task and train a fresh clone of the learner + train_task <- mlr3::as_task_classif( + train_data, + target = "did_transition", + positive = "TRUE" + ) + trained_learner <- learner$clone(deep = TRUE) + trained_learner$train(train_task) + + # Predict on test data; test_data includes did_transition as truth + prediction <- trained_learner$predict_newdata(test_data) + + # Score with supplied measures + scores <- as.list(prediction$score(measures)) + + # For AutoTuner: extract optimal inner learner; otherwise use trained learner + extract_from <- + if (inherits(trained_learner, "AutoTuner") && !is.null(trained_learner$learner$model)) { + trained_learner$learner + } else { + trained_learner + } + + l_id <- extract_from$id + l_params <- Filter( + function(v) is.atomic(v) && length(v) == 1L, + extract_from$param_set$values ) + l_params <- if (length(l_params) == 0L) NULL else l_params + l_spec <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) - # Create result row data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - model_family = model_family, - model_params = list(model_params), - goodness_of_fit = list(goodness_of_fit), - fit_call = fit_call_str, - model_obj_part = list(qs2::qs_serialize(model)), - model_obj_full = list(NULL) + learner_id = l_id, + learner_params = list(l_params), + learner_spec = list(l_spec), + crossval_measures = list(scores), + crossval_predictions = list(qs2::qs_serialize(prediction)), + learner_full = list(NULL) ) }, error = function(e) { warning(glue::glue( "Error fitting model for transition {item[['id_trans']]}: {e$message}" )) - return(data.table::data.table( + data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - model_family = "error", - model_params = list(NULL), - goodness_of_fit = list(list(failed = TRUE, message = e$message)), - fit_call = fit_call_str, - model_obj_part = list(NULL), - model_obj_full = list(NULL) - )) + learner_id = "error", + learner_params = list(NULL), + learner_spec = list(NULL), + crossval_measures = list(NULL), + crossval_predictions = list(NULL), + learner_full = list(NULL) + ) } ) } @@ -172,6 +179,10 @@ fit_partial_model_worker <- function( # Worker function for full model fitting # Not exported; used internally by fit_full_models fit_full_model_worker <- function(item, db, ...) { + if (!requireNamespace("mlr3", quietly = TRUE)) { + stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") + } + tryCatch( { # Fetch full data @@ -186,52 +197,65 @@ fit_full_model_worker <- function(item, db, ...) { )) } - # Retrieve the fit call from best partial model (as character string) - fit_call_str <- item[["fit_call"]] + pred_cols <- grep("^id_pred_", names(trans_pred_data_full), value = TRUE) + task_cols <- c("did_transition", pred_cols) + task_data <- trans_pred_data_full[, .SD, .SDcols = task_cols] + task_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] + + full_task <- mlr3::as_task_classif( + task_data, + target = "did_transition", + positive = "TRUE" + ) - # Check that fit_call exists - if (is.na(fit_call_str) || fit_call_str == "") { - stop(glue::glue( - "fit_call not found for transition {item[['id_trans']]}" - )) - } + # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) + learner_spec_raw <- item[["learner_spec"]][[1L]] + l_id <- item[["learner_id"]] + l_params <- item[["learner_params"]][[1L]] + + trained_learner <- tryCatch( + qs2::qs_deserialize(learner_spec_raw), + error = function(e) { + warning(glue::glue( + "learner_spec deserialization failed for {l_id}: {e$message}; ", + "falling back to do.call reconstruction" + )) + do.call(mlr3::lrn, c(list(l_id), as.list(l_params))) + } + ) - # Parse the character string to call object, reconstruct function, call - fit_fun <- eval(str2lang(fit_call_str)) - model_full <- fit_fun(data = trans_pred_data_full) + trained_learner$train(full_task) - # Create result row - copy from partial model but update model_obj_full list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - fit_call = fit_call_str, - model_obj_full = list(qs2::qs_serialize(model_full)) + learner_id = l_id, + learner_full = list(qs2::qs_serialize(trained_learner)) ) }, error = function(e) { warning(glue::glue( "Error fitting full model for transition {item[['id_trans']]}: {e$message}" )) - return(list( + list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - fit_call = item[["fit_call"]], - model_obj_full = list(NULL) - )) + learner_id = item[["learner_id"]], + learner_full = list(NULL) + ) } ) } -#' @describeIn trans_models_t Fit partial models for each viable transition and store -#' results in a trans_models_t table. -#' @param self, [evoland_db] instance to query for transitions and predictor data -#' @param fit_fun Function that takes a data.frame with predictors and did_transition columns -#' and returns a fitted model object. The data argument is passed as the first argument -#' to the function, and additional arguments can be passed via ... -#' @param gof_fun Function that takes a fitted model object and a test data.frame and -#' returns a list of goodness-of-fit metrics. The model argument is passed as the first -#' argument and the test_data argument is passed as the second argument. +#' @describeIn trans_models_t Fit partial (cross-validation) models for each viable +#' transition and store results in a trans_models_t table. +#' @param self [evoland_db] instance to query for transitions and predictor data +#' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained +#' for each transition; the original object is not modified. For `AutoTuner`, +#' the optimal inner learner is extracted after tuning. +#' @param measures A list of mlr3 `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`) +#' used to score the held-out predictions; results are written to `crossval_measures`. #' @param sample_frac Numeric between 0 and 1 indicating #' the fraction of data to use for training the partial models. The rest is used for #' testing and calculating goodness-of-fit metrics. Default is 0.7 (70% training, 30% @@ -241,12 +265,11 @@ fit_full_model_worker <- function(item, db, ...) { #' [mirai::make_cluster()]. fit_partial_models <- function( self, - fit_fun, - gof_fun, + learner, + measures, sample_frac = 0.7, seed = NULL, - cluster = NULL, - ... + cluster = NULL ) { trans_preds_nested <- data.table::as.data.table(self$trans_preds_t)[, @@ -265,8 +288,8 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, - "fit_fun must be a function" = is.function(fit_fun), - "gof_fun must be a function" = is.function(gof_fun), + "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), + "measures must be a non-empty list" = is.list(measures) && length(measures) > 0L, "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -282,30 +305,33 @@ fit_partial_models <- function( worker_fun = fit_partial_model_worker, parent_db = self, cluster = cluster, - fit_fun = fit_fun, - gof_fun = gof_fun, + learner = learner, + measures = measures, seed = seed, - sample_frac = sample_frac, - ... + sample_frac = sample_frac ) |> data.table::rbindlist() |> as_trans_models_t() } #' @describeIn trans_models_t Fit full models for each transition based on the best -#' partial model according to a specified goodness-of-fit criterion. -#' @param self, [evoland_db] instance to query for transitions and predictor data -#' @param partial_models A trans_models_t table containing the fitted partial models and -#' their goodness-of-fit metrics. -#' @param gof_criterion Character string specifying which goodness-of-fit metric to use for -#' selecting the best partial model for each transition (e.g., "roc_auc", "rmse"). -#' @param gof_maximize Logical indicating whether to select the model with the maximum -#' (TRUE) or minimum (FALSE) value of the specified goodness-of-fit criterion. Default -#' is TRUE. +#' partial model according to a specified cross-validation criterion. +#' @param self [evoland_db] instance to query for transitions and predictor data +#' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and +#' used as a last-resort fallback if both `learner_spec` deserialization and +#' `do.call(mlr3::lrn, ...)` reconstruction fail. +#' @param measures A list of mlr3 `Measure` objects; kept for API consistency. +#' @param gof_criterion Character string specifying which cross-validation measure to use +#' for selecting the best partial model per transition (must match a key in +#' `crossval_measures`, e.g. `"classif.auc"`). +#' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum +#' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. fit_full_models <- function( self, + learner, + measures, gof_criterion, gof_maximize, cluster = NULL @@ -313,11 +339,12 @@ fit_full_models <- function( stopifnot( "gof_criterion must be a character string" = is.character(gof_criterion) && length(gof_criterion) == 1L, - "gof_maximize must be a set to TRUE or FALSE" = (gof_maximize || !gof_maximize), + "gof_maximize must be set to TRUE or FALSE" = isTRUE(gof_maximize) || isFALSE(gof_maximize), "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) ) - best_models <- self$get_query(glue::glue( + # Get the best partial model per transition (scalar columns only; MAP/BLOB via fetch below) + best_model_ids <- self$get_query(glue::glue( r"[ with preds_nested as ( select @@ -332,7 +359,7 @@ fit_full_models <- function( select tm.id_run, tm.id_trans, - tm.fit_call, + tm.learner_id, pn.id_pred, from {self$get_read_expr("trans_models_t")} tm, @@ -342,11 +369,27 @@ fit_full_models <- function( and pn.id_trans = tm.id_trans qualify row_number() over ( partition by tm.id_run, tm.id_trans - order by tm.goodness_of_fit['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by tm.crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} ) = 1; ]" )) + # Fetch learner_spec (BLOB) and learner_params (MAP) for the best rows via fetch() + # so that MAP columns are properly deserialized to named lists. + learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") + best_specs <- self$fetch( + "trans_models_t", + cols = c("id_run", "id_trans", "learner_id", "learner_spec", "learner_params"), + where = glue::glue( + "id_run in ({toString(best_model_ids$id_run)}) and ", + "id_trans in ({toString(best_model_ids$id_trans)}) and ", + "learner_id in ({learner_id_csv})" + ) + ) + + # Join to add id_pred and build complete item list for workers + best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] + message(glue::glue( "Fitting full models for {nrow(best_models)} transitions..." )) @@ -362,31 +405,33 @@ fit_full_models <- function( ) |> data.table::rbindlist() + # Fetch remaining columns from the best partial models and join partial_models <- self$fetch( "trans_models_t", cols = c( "id_run", "id_trans", - "fit_call", - "model_family", - "model_params", - "goodness_of_fit", - "model_obj_part" + "learner_id", + "learner_params", + "learner_spec", + "crossval_measures", + "crossval_predictions" ), where = glue::glue( "id_run in ({toString(full_models$id_run)}) and ", - "id_trans in ({toString(full_models$id_trans)})" + "id_trans in ({toString(full_models$id_trans)}) and ", + "learner_id in ({paste0(\"'\", full_models$learner_id, \"'\", collapse = \", \")})" ) ) full_models[ partial_models, - on = c("id_run", "id_trans", "fit_call"), + on = c("id_run", "id_trans", "learner_id"), `:=`( - model_family = i.model_family, - model_params = i.model_params, - goodness_of_fit = i.goodness_of_fit, - model_obj_part = i.model_obj_part + learner_params = i.learner_params, + learner_spec = i.learner_spec, + crossval_measures = i.crossval_measures, + crossval_predictions = i.crossval_predictions ) ] |> as_trans_models_t() @@ -402,12 +447,12 @@ validate.trans_models_t <- function(x, ...) { c( "id_run", "id_trans", - "model_family", - "model_params", - "goodness_of_fit", - "fit_call", - "model_obj_part", - "model_obj_full" + "learner_id", + "learner_params", + "learner_spec", + "crossval_measures", + "crossval_predictions", + "learner_full" ) ) @@ -419,14 +464,14 @@ validate.trans_models_t <- function(x, ...) { stopifnot( is.integer(x[["id_run"]]), is.integer(x[["id_trans"]]), - is.character(x[["model_family"]]), - is.list(x[["model_params"]]), - is.list(x[["goodness_of_fit"]]), - is.character(x[["fit_call"]]), - is.list(x[["model_obj_part"]]), - is.list(x[["model_obj_full"]]), + is.character(x[["learner_id"]]), + is.list(x[["learner_params"]]), + is.list(x[["learner_spec"]]), + is.list(x[["crossval_measures"]]), + is.list(x[["crossval_predictions"]]), + is.list(x[["learner_full"]]), all(x[["id_trans"]] > 0), - !any(x[["model_family"]] == "") + !any(x[["learner_id"]] == "") ) return(x) @@ -437,16 +482,16 @@ validate.trans_models_t <- function(x, ...) { print.trans_models_t <- function(x) { if (nrow(x) > 0) { n_trans <- data.table::uniqueN(x[["id_trans"]]) - model_families <- unique(x[["model_family"]]) - n_with_part_models <- sum(!vapply(x[["model_obj_part"]], is.null, logical(1))) - n_with_full_models <- sum(!vapply(x[["model_obj_full"]], is.null, logical(1))) + learner_ids <- unique(x[["learner_id"]]) + n_with_crossval <- sum(!vapply(x[["crossval_predictions"]], is.null, logical(1))) + n_with_full <- sum(!vapply(x[["learner_full"]], is.null, logical(1))) cat(glue::glue( "Transition Models Table\n", "Total models: {nrow(x)}\n", "Transitions: {n_trans}\n", - "Model families: {paste(model_families, collapse = ', ')}\n", - "With partial models: {n_with_part_models}, With full models: {n_with_full_models}\n\n" + "Learners: {paste(learner_ids, collapse = ', ')}\n", + "With cross-val predictions: {n_with_crossval}, With full models: {n_with_full}\n\n" )) } else { cat("Transition Models Table (empty)\n") @@ -454,3 +499,46 @@ print.trans_models_t <- function(x) { print_rowwise_yaml(x) invisible(x) } + +#' @describeIn trans_models_t Deserialize cross-validation predictions and return +#' plots via `mlr3viz::autoplot()`. Requires the `mlr3viz` package. +#' @param self [evoland_db] instance +#' @param id_run Optional integer; filter by run ID. +#' @param id_trans Optional integer; filter by transition ID. +get_crossval_plots <- function(self, id_run = NULL, id_trans = NULL) { + if (!requireNamespace("mlr3viz", quietly = TRUE)) { + stop("Package 'mlr3viz' is required. Install with: install.packages('mlr3viz')") + } + + where_clauses <- c() + if (!is.null(id_run)) { + where_clauses <- c(where_clauses, glue::glue("id_run = {id_run}")) + } + if (!is.null(id_trans)) { + where_clauses <- c(where_clauses, glue::glue("id_trans = {id_trans}")) + } + where <- if (length(where_clauses) > 0L) paste(where_clauses, collapse = " and ") else NULL + + models <- self$fetch( + "trans_models_t", + cols = c("id_run", "id_trans", "learner_id", "crossval_predictions"), + where = where + ) + + plots <- lapply(seq_len(nrow(models)), function(i) { + pred_blob <- models$crossval_predictions[[i]] + if (is.null(pred_blob)) { + return(NULL) + } + prediction <- qs2::qs_deserialize(pred_blob) + mlr3viz::autoplot(prediction) + }) + + names(plots) <- paste0( + "id_run=", models$id_run, + "_id_trans=", models$id_trans, + "_", models$learner_id + ) + + plots +} diff --git a/R/trans_pot_t.R b/R/trans_pot_t.R index 75e351e..d71a36b 100644 --- a/R/trans_pot_t.R +++ b/R/trans_pot_t.R @@ -113,10 +113,10 @@ predict_trans_pot <- function( # Get model for this transition model_row <- self$get_query(glue::glue( r"[ - select model_obj_full + select learner_full from {self$get_read_expr("trans_models_t")} where id_trans = {id_trans} - order by goodness_of_fit['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} limit 1 ]" )) @@ -125,8 +125,8 @@ predict_trans_pot <- function( stop(glue::glue("Expecting exactly one model for id_trans={id_trans}")) } - # Deserialize full model - model_obj <- qs2::qs_deserialize(model_row$model_obj_full[[1]]) + # Deserialize full learner + learner_obj <- qs2::qs_deserialize(model_row$learner_full[[1]]) # Get predictor data for id_period_post at coords with id_lulc_ant at id_period_post - 1 pred_data_post <- self$pred_data_wide_v( @@ -141,33 +141,10 @@ predict_trans_pot <- function( next } - # Predict probabilities - # Drop id_coord for prediction + # Predict probabilities using mlr3 predict_newdata; drop id_coord (non-feature) pred_cols <- grep("^id_pred_", names(pred_data_post), value = TRUE) - - # Predict - assuming model has predict() method that returns probabilities - # FIXME apparently not all models respect the standard function signature, pull in - # mlr3 - if (inherits(model_obj, "ranger")) { - probs <- - predict( - model_obj, - data = pred_data_post[, -"id_coord"], - type = "response" - )[[ - "predictions" # assume model has been run with probability = TRUE - ]][, - "TRUE" # matrix column for transition _does_ occur - ] - } else { - probs <- - predict( - model_obj, - newdata = pred_data_post[, -"id_coord"], - type = "response" - ) |> - setNames(NULL) - } + pred_feature_data <- pred_data_post[, .SD, .SDcols = pred_cols] + probs <- learner_obj$predict_newdata(pred_feature_data)$prob[, "TRUE"] # Ensure probabilities are in [0, 1] probs <- pmax(0, pmin(1, probs)) diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 06884d7..0dd1429 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -5,6 +5,10 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } +if (!requireNamespace("mlr3", quietly = TRUE)) { + exit_file("mlr3 not available; skipping allocation integration tests") +} + source(file.path(system.file("tinytest", package = "evoland"), "helper_testdb.R")) db <- make_test_db() db$trans_rates_t <- db$get_obs_trans_rates() @@ -13,18 +17,24 @@ db$trans_rates_t <- extrapolate_trans_rates( db$periods_t, coord_count = nrow(db$coords_t) ) -# test the package's standard glm quasibinomial fit and append to disk + +test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") +test_measures <- list(mlr3::msr("classif.auc")) + +# test the package's featureless learner fit and append to disk expect_message( db$trans_models_t <- db$fit_partial_models( - fit_fun = fit_glm, - gof_fun = gof_glm, + learner = test_learner, + measures = test_measures, seed = 1244244, ), "Fitting partial models for 2 transitions..." ) expect_message( db$trans_models_t <- db$fit_full_models( - gof_criterion = "auc", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "Fitting full models for" diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index b0f972e..0308d57 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -7,19 +7,21 @@ expect_stdout(print(as_trans_models_t()), "Transition Models Table") trans_models_t <- as_trans_models_t(data.table::data.table( id_run = 1000L, id_trans = 1L, - model_family = "rf", - model_params = list( - list(depth = 100, ntrees = 500) + learner_id = "classif.featureless", + learner_params = list( + list(method = "mode") ), - goodness_of_fit = list( - list(auc = 0.8, rmse = 0.15) + learner_spec = list( + charToRaw("learner spec blob") ), - fit_call = "fit_fun(data = data)", - model_obj_part = list( - charToRaw("partial model data") + crossval_measures = list( + list(classif.auc = 0.8) ), - model_obj_full = list( - charToRaw("full model data") + crossval_predictions = list( + charToRaw("predictions blob") + ), + learner_full = list( + charToRaw("full learner blob") ) )) expect_equal(nrow(trans_models_t), 1L) @@ -30,6 +32,10 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } +if (!requireNamespace("mlr3", quietly = TRUE)) { + exit_file("mlr3 not available; skipping integration tests") +} + # Load fixtures via helper source(file.path( system.file("tinytest", package = "evoland"), @@ -37,194 +43,118 @@ source(file.path( )) db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -# Test fit_partial_models and fit_full_models workflow -# Define a simple mock fit function for testing -fit_mock_glm <- function(data, ...) { - pred_cols <- grep("^id_pred_", names(data), value = TRUE) - - if (length(pred_cols) == 0) { - stop("No predictor columns found") - } - - # Create a simple formula - formula_str <- paste("did_transition", "~", paste(pred_cols, collapse = " + ")) - formula <- as.formula(formula_str) - - # Fit a simple GLM - model <- glm(formula, data = data, family = binomial()) - - return(model) -} - -# Define a goodness of fit function -gof_mock <- function(model, test_data) { - predictions <- predict(model, newdata = test_data, type = "response") - actual <- test_data[["did_transition"]] - - # Simple correlation-based metric - cor_metric <- cor(predictions, actual, use = "complete.obs") - - # Mean squared error - mse <- mean((predictions - actual)^2, na.rm = TRUE) - - list( - cor = cor_metric, - mse = mse, - n_test = nrow(test_data) - ) -} +# Use a simple featureless learner for fast, dependency-free testing +test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") +test_measures <- list(mlr3::msr("classif.auc"), mlr3::msr("classif.acc")) # Test fit_partial_models expect_message( db$trans_models_t <- partial_models <- db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 0.7, - seed = 123, - other_param = "nonce" + seed = 123 ), "Fitting partial models for 2 transitions..." ) expect_length( partial_models, - 8L # columns; length is NULL if all fail -) -expect_match( - partial_models$fit_call, - r"{function \(data, ..., other_param = "nonce"\)}" -) -expect_equal( - partial_models$model_params, - list( - list(n_predictors = 4L, n_train = 514L, sample_frac = 0.7, other_param = "nonce"), - list(n_predictors = 4L, n_train = 748L, sample_frac = 0.7, other_param = "nonce") - ) + 8L # columns ) expect_true(all( - vapply(partial_models$model_obj_part, is.raw, logical(1)) + c("id_run", "id_trans", "learner_id", "learner_params", + "learner_spec", "crossval_measures", "crossval_predictions", "learner_full") %in% + names(partial_models) )) expect_equal( - partial_models$goodness_of_fit, - list( - list(cor = 0.0224227324563254, mse = 0.250509844417008, n_test = 219L), - list(cor = 0.0267497369302788, mse = 0.248782532059352, n_test = 319L) - ), - tolerance = 1e-6 + unique(partial_models$learner_id), + "classif.featureless" ) +expect_true(all( + vapply(partial_models$learner_spec, is.raw, logical(1)) +)) +expect_true(all( + vapply(partial_models$crossval_predictions, is.raw, logical(1)) +)) +expect_true(all( + vapply(partial_models$learner_full, is.null, logical(1)) +)) -# Test that model deserialization works -expect_inherits( - qs2::qs_deserialize(partial_models$model_obj_part[[1]]), - "glm" -) +# Test that learner_spec deserializes to an mlr3 Learner +deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) +expect_true(inherits(deserialized_spec, "Learner")) +expect_equal(deserialized_spec$id, "classif.featureless") -# Test fit_full_models, which ensures we can retrieve and evaluate the embedded model function -expect_message( - db$fit_full_models( - gof_criterion = "cor", - gof_maximize = TRUE, - ), - "Fitting full models for" -) +# crossval_measures should be named lists with measure IDs as keys +expect_true(all(vapply( + partial_models$crossval_measures, + function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), + logical(1) +))) -# test the package's standard rf fit and append to disk -expect_message( - db$trans_models_t <- db$fit_partial_models( - fit_fun = fit_ranger, - gof_fun = gof_ranger, - seed = 1244244, - ) -) -# test the package's standard glm quasibinomial fit -expect_message( - db$fit_partial_models( - fit_fun = fit_glm, - gof_fun = gof_glm, - seed = 1244244, - ) -) +# Test fit_full_models, which reconstructs and retrains on full data expect_message( db$trans_models_t <- full_models <- db$fit_full_models( - gof_criterion = "auc", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "Fitting full models for" ) -# test DB round trip +# Test DB round trip expect_equal(nrow(full_models), 2L) -expect_equal(db$row_count("trans_models_t"), 4L) -full_mods_roundtrip <- db$trans_models_t[id_trans == 2L & model_family == "ranger"] -data.table::setattr( - full_mods_roundtrip, - "parquet_db_t_class", - NULL -) # remove attribute for testing equivalence +full_mods_roundtrip <- db$trans_models_t[id_trans == 2L & learner_id == "classif.featureless"] +data.table::setattr(full_mods_roundtrip, "parquet_db_t_class", NULL) expect_identical( full_mods_roundtrip, - full_models[id_trans == 2L & model_family == "ranger", ] + full_models[id_trans == 2L & learner_id == "classif.featureless"] ) -# Check that both partial and full models are present -expect_true(all(!vapply(full_models$model_obj_part, is.null, logical(1)))) -expect_true(all(!vapply(full_models$model_obj_full, is.null, logical(1)))) +# Check that both crossval_predictions and learner_full are present +expect_true(all(!vapply(full_models$crossval_predictions, is.null, logical(1)))) +expect_true(all(!vapply(full_models$learner_full, is.null, logical(1)))) -# Test that full model deserialization works -expect_inherits( - qs2::qs_deserialize(full_models$model_obj_full[[1]]), - "ranger" -) +# Test that learner_full deserializes to a trained mlr3 Learner +deserialized_full <- qs2::qs_deserialize(full_models$learner_full[[1]]) +expect_true(inherits(deserialized_full, "Learner")) +expect_false(is.null(deserialized_full$model)) # Test model selection with minimize criterion expect_message( full_models_min <- db$fit_full_models( - gof_criterion = "mse", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.acc", gof_maximize = FALSE ), "Fitting full models for" ) -# Test error handling - missing fit_fun parameter -expect_error( - db$fit_partial_models( - gof_fun = gof_mock, - sample_frac = 0.7 - ), - "argument \"fit_fun\" is missing" -) - -# Test error handling - missing gof_fun parameter +# Test error handling - missing learner parameter expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, + measures = test_measures, sample_frac = 0.7 ), - "argument \"gof_fun\" is missing" -) - -# Test error handling - fit_fun is not a function -expect_error( - db$fit_partial_models( - fit_fun = "not_a_function", - gof_fun = gof_mock - ), - "fit_fun must be a function" + "argument \"learner\" is missing" ) -# Test error handling - gof_fun is not a function +# Test error handling - learner is not an mlr3 Learner expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = "not_a_function" + learner = "not_a_learner", + measures = test_measures ), - "gof_fun must be a function" + "learner must be an mlr3 Learner or AutoTuner" ) # Test error handling - invalid sample_frac expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 0 ), "sample_frac must be between 0 and 1" @@ -232,8 +162,8 @@ expect_error( expect_error( db$fit_partial_models( - fit_fun = fit_mock_glm, - gof_fun = gof_mock, + learner = test_learner, + measures = test_measures, sample_frac = 1 ), "sample_frac must be between 0 and 1" @@ -243,48 +173,55 @@ expect_error( db$delete_from("trans_models_t") expect_error( db$fit_full_models( - gof_criterion = "cor", + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", gof_maximize = TRUE ), "trans_models_t is missing" ) -# Test fit function that throws an error -fit_error <- function(data, ...) { - stop("Intentional error for testing") -} +# Test fit function that throws an error (simulate via a bad learner) +# We achieve this by running with a learner that fails (no viable data after +# deleting the table above means we re-populate and then use a bad setup). +# Instead, test via a mock that warns on error using an out-of-range sample_frac +# -> use a real error scenario by removing all preds +db2 <- make_test_db(include_neighbors = FALSE, include_trans_preds = FALSE) +db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( + id_run = 0L, + id_pred = 99999L, # non-existent predictor + id_trans = 1L +)) expect_warning( partial_models_error <- - db$fit_partial_models( - fit_fun = fit_error, - gof_fun = gof_mock, + db2$fit_partial_models( + learner = test_learner, + measures = test_measures, sample_frac = 0.7, seed = 123 ), - "Intentional error for testing" + "No predictor columns|No data" ) - -# Should return NULL when all transitions fail expect_equal( - partial_models_error, - as_trans_models_t(data.table::data.table( - id_run = 0L, - id_trans = 1:2, - model_family = "error", - model_params = list(NULL), - goodness_of_fit = list( - list(failed = TRUE, message = "Intentional error for testing"), - list(failed = TRUE, message = "Intentional error for testing") - ), - fit_call = "function (data, ...) \n {\n stop(\"Intentional error for testing\")\n }", - model_obj_part = list(NULL), - model_obj_full = list(NULL) - )) + partial_models_error$learner_id, + "error" ) # Test print method +db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +db3$trans_models_t <- db3$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 +) +db3$trans_models_t <- db3$fit_full_models( + learner = test_learner, + measures = test_measures, + gof_criterion = "classif.auc", + gof_maximize = TRUE +) expect_stdout( - print(full_models), + print(db3$trans_models_t), "Transition Models Table|Total models" ) From dcf8d494c5a2a69d3593f2de384ef81ef3dcef78 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:38:08 +0000 Subject: [PATCH 03/15] Rename l_id/l_params/l_spec to clearer names per code review Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/16192889-e909-4511-bc6c-4fc80926b365 Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_models_t.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index cdf6adf..6a73873 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -139,20 +139,20 @@ fit_partial_model_worker <- function( trained_learner } - l_id <- extract_from$id - l_params <- Filter( + learner_id_val <- extract_from$id + learner_params_val <- Filter( function(v) is.atomic(v) && length(v) == 1L, extract_from$param_set$values ) - l_params <- if (length(l_params) == 0L) NULL else l_params - l_spec <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) + learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val + learner_spec_blob <- qs2::qs_serialize(extract_from$clone(deep = TRUE)$reset()) data.table::data.table( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = l_id, - learner_params = list(l_params), - learner_spec = list(l_spec), + learner_id = learner_id_val, + learner_params = list(learner_params_val), + learner_spec = list(learner_spec_blob), crossval_measures = list(scores), crossval_predictions = list(qs2::qs_serialize(prediction)), learner_full = list(NULL) @@ -210,17 +210,17 @@ fit_full_model_worker <- function(item, db, ...) { # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) learner_spec_raw <- item[["learner_spec"]][[1L]] - l_id <- item[["learner_id"]] - l_params <- item[["learner_params"]][[1L]] + learner_id_val <- item[["learner_id"]] + learner_params_val <- item[["learner_params"]][[1L]] trained_learner <- tryCatch( qs2::qs_deserialize(learner_spec_raw), error = function(e) { warning(glue::glue( - "learner_spec deserialization failed for {l_id}: {e$message}; ", + "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", "falling back to do.call reconstruction" )) - do.call(mlr3::lrn, c(list(l_id), as.list(l_params))) + do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) } ) @@ -229,7 +229,7 @@ fit_full_model_worker <- function(item, db, ...) { list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = l_id, + learner_id = learner_id_val, learner_full = list(qs2::qs_serialize(trained_learner)) ) }, From f6555a6dff4f8049db8ac8b0ac65a6e50c071df5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:33:02 +0000 Subject: [PATCH 04/15] Address review: crossval_score rename, mlr3 to Imports, warning, docs, char measures, no-copy predict Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/0bd7d123-2149-49b7-b96d-7fb7ff71f3ac Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- DESCRIPTION | 2 +- R/trans_models_t.R | 51 +++++++++++++++-------- R/trans_pot_t.R | 8 ++-- inst/tinytest/test_integ_trans_models_t.R | 11 ++--- 4 files changed, 43 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fac40ea..622ea9c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,7 @@ Imports: DBI, duckdb (>= 1.5.2), glue, + mlr3, qs2, R6, Rcpp, @@ -26,7 +27,6 @@ Imports: Suggests: base64enc, butcher, - mlr3, mlr3viz, pROC, processx, diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 6a73873..d1e6aa5 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -16,7 +16,7 @@ #' querying; complete hyperparameters are captured by `learner_spec` #' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for #' AutoTuners, this is the optimal inner learner after tuning -#' - `crossval_measures`: MAP of cross-validation performance scores +#' - `crossval_score`: MAP of cross-validation performance scores #' (from `prediction$score(measures)`) #' - `crossval_predictions`: BLOB of serialized mlr3 `PredictionClassif` #' on the held-out test split @@ -31,7 +31,7 @@ as_trans_models_t <- function(x) { learner_id = character(0), learner_params = list(), learner_spec = list(), - crossval_measures = list(), + crossval_score = list(), crossval_predictions = list(), learner_full = list() ) @@ -45,7 +45,7 @@ as_trans_models_t <- function(x) { x, "trans_models_t", key_cols = c("id_run", "id_trans", "learner_id"), - map_cols = c("learner_params", "crossval_measures"), + map_cols = c("learner_params", "crossval_score"), partition_cols = "id_run" ) } @@ -153,7 +153,7 @@ fit_partial_model_worker <- function( learner_id = learner_id_val, learner_params = list(learner_params_val), learner_spec = list(learner_spec_blob), - crossval_measures = list(scores), + crossval_score = list(scores), crossval_predictions = list(qs2::qs_serialize(prediction)), learner_full = list(NULL) ) @@ -168,7 +168,7 @@ fit_partial_model_worker <- function( learner_id = "error", learner_params = list(NULL), learner_spec = list(NULL), - crossval_measures = list(NULL), + crossval_score = list(NULL), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -216,11 +216,12 @@ fit_full_model_worker <- function(item, db, ...) { trained_learner <- tryCatch( qs2::qs_deserialize(learner_spec_raw), error = function(e) { + fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) warning(glue::glue( "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to do.call reconstruction" + "falling back to reconstructed learner: {fallback$format()}" )) - do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) + fallback } ) @@ -254,8 +255,10 @@ fit_full_model_worker <- function(item, db, ...) { #' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained #' for each transition; the original object is not modified. For `AutoTuner`, #' the optimal inner learner is extracted after tuning. -#' @param measures A list of mlr3 `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`) -#' used to score the held-out predictions; results are written to `crossval_measures`. +#' @param measures Either a character vector of mlr3 measure IDs +#' (e.g. `c("classif.auc", "classif.acc")`) or a list of instantiated mlr3 +#' `Measure` objects (e.g. `list(mlr3::msr("classif.auc"))`). Character IDs are +#' converted via `mlr3::msrs()` internally. Results are written to `crossval_score`. #' @param sample_frac Numeric between 0 and 1 indicating #' the fraction of data to use for training the partial models. The rest is used for #' testing and calculating goodness-of-fit metrics. Default is 0.7 (70% training, 30% @@ -263,6 +266,9 @@ fit_full_model_worker <- function(item, db, ...) { #' @param seed Optional integer seed for reproducible subsampling. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. +#' @return A [trans_models_t] table with one row per viable transition, containing +#' the learner identity, serialized spec, cross-validation scores (`crossval_score`), +#' and serialized held-out predictions (`crossval_predictions`). fit_partial_models <- function( self, learner, @@ -271,6 +277,11 @@ fit_partial_models <- function( seed = NULL, cluster = NULL ) { + # Accept either a character vector of measure IDs or a list of Measure objects + if (is.character(measures)) { + measures <- mlr3::msrs(measures) + } + trans_preds_nested <- data.table::as.data.table(self$trans_preds_t)[, .(id_pred = list(id_pred)), @@ -289,7 +300,7 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty list" = is.list(measures) && length(measures) > 0L, + "measures must be a non-empty character vector or list" = (is.list(measures) || is.character(measures)) && length(measures) > 0L, "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -320,14 +331,18 @@ fit_partial_models <- function( #' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and #' used as a last-resort fallback if both `learner_spec` deserialization and #' `do.call(mlr3::lrn, ...)` reconstruction fail. -#' @param measures A list of mlr3 `Measure` objects; kept for API consistency. -#' @param gof_criterion Character string specifying which cross-validation measure to use +#' @param measures Either a character vector of mlr3 measure IDs or a list of `Measure` +#' objects; kept for API consistency. +#' @param gof_criterion Character string specifying which cross-validation score to use #' for selecting the best partial model per transition (must match a key in -#' `crossval_measures`, e.g. `"classif.auc"`). +#' `crossval_score`, e.g. `"classif.auc"`). #' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum #' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. +#' @return A [trans_models_t] table with one row per transition, containing the columns +#' from the best partial model plus `learner_full` with the serialized fully-trained +#' learner. fit_full_models <- function( self, learner, @@ -369,7 +384,7 @@ fit_full_models <- function( and pn.id_trans = tm.id_trans qualify row_number() over ( partition by tm.id_run, tm.id_trans - order by tm.crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by tm.crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} ) = 1; ]" )) @@ -414,7 +429,7 @@ fit_full_models <- function( "learner_id", "learner_params", "learner_spec", - "crossval_measures", + "crossval_score", "crossval_predictions" ), where = glue::glue( @@ -430,7 +445,7 @@ fit_full_models <- function( `:=`( learner_params = i.learner_params, learner_spec = i.learner_spec, - crossval_measures = i.crossval_measures, + crossval_score = i.crossval_score, crossval_predictions = i.crossval_predictions ) ] |> @@ -450,7 +465,7 @@ validate.trans_models_t <- function(x, ...) { "learner_id", "learner_params", "learner_spec", - "crossval_measures", + "crossval_score", "crossval_predictions", "learner_full" ) @@ -467,7 +482,7 @@ validate.trans_models_t <- function(x, ...) { is.character(x[["learner_id"]]), is.list(x[["learner_params"]]), is.list(x[["learner_spec"]]), - is.list(x[["crossval_measures"]]), + is.list(x[["crossval_score"]]), is.list(x[["crossval_predictions"]]), is.list(x[["learner_full"]]), all(x[["id_trans"]] > 0), diff --git a/R/trans_pot_t.R b/R/trans_pot_t.R index d71a36b..f158d4e 100644 --- a/R/trans_pot_t.R +++ b/R/trans_pot_t.R @@ -116,7 +116,7 @@ predict_trans_pot <- function( select learner_full from {self$get_read_expr("trans_models_t")} where id_trans = {id_trans} - order by crossval_measures['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} + order by crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} limit 1 ]" )) @@ -141,10 +141,8 @@ predict_trans_pot <- function( next } - # Predict probabilities using mlr3 predict_newdata; drop id_coord (non-feature) - pred_cols <- grep("^id_pred_", names(pred_data_post), value = TRUE) - pred_feature_data <- pred_data_post[, .SD, .SDcols = pred_cols] - probs <- learner_obj$predict_newdata(pred_feature_data)$prob[, "TRUE"] + # Predict probabilities using mlr3 predict_newdata; id_coord is dropped automatically + probs <- learner_obj$predict_newdata(pred_data_post)$prob[, "TRUE"] # Ensure probabilities are in [0, 1] probs <- pmax(0, pmin(1, probs)) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 0308d57..2b56ecc 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -14,7 +14,7 @@ trans_models_t <- as_trans_models_t(data.table::data.table( learner_spec = list( charToRaw("learner spec blob") ), - crossval_measures = list( + crossval_score = list( list(classif.auc = 0.8) ), crossval_predictions = list( @@ -45,7 +45,8 @@ db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) # Use a simple featureless learner for fast, dependency-free testing test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") -test_measures <- list(mlr3::msr("classif.auc"), mlr3::msr("classif.acc")) +# measures can be passed as a character vector of IDs (convenience) or as a list of Measure objects +test_measures <- c("classif.auc", "classif.acc") # Test fit_partial_models expect_message( @@ -63,7 +64,7 @@ expect_length( ) expect_true(all( c("id_run", "id_trans", "learner_id", "learner_params", - "learner_spec", "crossval_measures", "crossval_predictions", "learner_full") %in% + "learner_spec", "crossval_score", "crossval_predictions", "learner_full") %in% names(partial_models) )) expect_equal( @@ -85,9 +86,9 @@ deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) expect_true(inherits(deserialized_spec, "Learner")) expect_equal(deserialized_spec$id, "classif.featureless") -# crossval_measures should be named lists with measure IDs as keys +# crossval_score should be named lists with measure IDs as keys expect_true(all(vapply( - partial_models$crossval_measures, + partial_models$crossval_score, function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), logical(1) ))) From d7393dff698155f1a6bbfbd11ad911ad6f6f8caa Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:33:54 +0000 Subject: [PATCH 05/15] Fix fallback format() collapse and strengthen measures validation Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/0bd7d123-2149-49b7-b96d-7fb7ff71f3ac Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/trans_models_t.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index d1e6aa5..ccd994c 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -219,7 +219,7 @@ fit_full_model_worker <- function(item, db, ...) { fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) warning(glue::glue( "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to reconstructed learner: {fallback$format()}" + "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" )) fallback } @@ -300,7 +300,10 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty character vector or list" = (is.list(measures) || is.character(measures)) && length(measures) > 0L, + "measures must be a non-empty character vector or list of Measure objects" = ( + (is.character(measures) || (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && + length(measures) > 0L + ), "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) From 9930643ea37bee1e66430dadbc9eb2ffb6bd3761 Mon Sep 17 00:00:00 2001 From: Jan Hartman <24587121+mmyrte@users.noreply.github.com> Date: Sat, 18 Apr 2026 09:38:16 +0200 Subject: [PATCH 06/15] air autoformat Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- R/trans_models_t.R | 18 +++++++++++------- inst/tinytest/test_integ_trans_models_t.R | 12 ++++++++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index ccd994c..1ec4bfc 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -300,10 +300,11 @@ fit_partial_models <- function( stopifnot( "No viable transitions" = nrow(viable_trans) > 0L, "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner"), - "measures must be a non-empty character vector or list of Measure objects" = ( - (is.character(measures) || (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && - length(measures) > 0L - ), + "measures must be a non-empty character vector or list of Measure objects" = ((is.character( + measures + ) || + (is.list(measures) && all(vapply(measures, inherits, logical(1), "Measure")))) && + length(measures) > 0L), "sample_frac must be between 0 and 1" = sample_frac > 0 && sample_frac < 1 ) @@ -553,9 +554,12 @@ get_crossval_plots <- function(self, id_run = NULL, id_trans = NULL) { }) names(plots) <- paste0( - "id_run=", models$id_run, - "_id_trans=", models$id_trans, - "_", models$learner_id + "id_run=", + models$id_run, + "_id_trans=", + models$id_trans, + "_", + models$learner_id ) plots diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 2b56ecc..5600065 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -63,8 +63,16 @@ expect_length( 8L # columns ) expect_true(all( - c("id_run", "id_trans", "learner_id", "learner_params", - "learner_spec", "crossval_score", "crossval_predictions", "learner_full") %in% + c( + "id_run", + "id_trans", + "learner_id", + "learner_params", + "learner_spec", + "crossval_score", + "crossval_predictions", + "learner_full" + ) %in% names(partial_models) )) expect_equal( From cf15553161211d7610562eb12d618a9141f6e6a4 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:58:33 +0200 Subject: [PATCH 07/15] more concise AGENTS.md --- .gitignore | 1 - AGENTS.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ rproject.toml | 16 ++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md create mode 100644 rproject.toml diff --git a/.gitignore b/.gitignore index 1ebed01..147b726 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,6 @@ vignettes/*.pdf # virtual environments renv* rv* -rproject.toml # ignore duckdb database files and parquet *db diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..77b2634 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,84 @@ +# Development instructions + +If you are a human, take these instructions as guidance. +If you are an agent, follow these instructions. +Ask for clarifications where specifications are unclear or edge cases crop up. + +This is an R package that is used for land use / land cover change analyses and simulations. +It builds on the approaches of [lulcc](https://github.com/simonmoulds/lulcc), [clumpy](https://github.com/mmyrte/clumpy), and [Dinamica EGO](https://dinamicaego.com/). + +## Project layout + +- `R/parquet_db.R`, `R/parquet_db_utils.R`: generic parquet-backed DB layer +- `R/evoland_db.R`, `R/evoland_db_util.R`, `R/evoland_db_views.R`: domain DB and views (inherits parquet_db); register new methods using `create_method_binding` in `evoland_db` constructor +- `R/*_t.R`: table type definitions (`as_*_t` constructors) +- `R/util*.R`: internal helpers +- `R/*.R` (remaining): domain logic (allocation, filtering, modelling) +- `src/`: Rcpp / C++ code +- `inst/tinytest/`: tests +- `inst/dinamica_models/`: Dinamica EGO models +- `inst/*.sql`: SQL views; use glue for string interpolation +- `data-raw/`: raw data and test fixture generation scripts +- `man/`: documentation generated by roxygen2 +- `vignettes/*.qmd`: vignettes and how-tos + +## Environment setup + +Use [rv](https://a2-ai.github.io/rv-docs/). Run `rv sync` to install all dependencies (see `rproject.toml`). +Recompile and attach the package using `pkgload::load_all()`. + +## Conventions and Style + +Prefer base R solutions over external dependencies where they are equally readable. +Avoid new dependencies. +Avoid packages from the tidyverse, as their APIs tend to be unstable. +Avoid niche packages that are seldom maintained; if the functionality is simple, rather implement it as a non-exported utility function. + +Prefer exact double bracket `[["name"]]` list subsetting over `$name`, which does partial matching. + +Use `air` as autoformatter (`./air.toml`). +Use `lintr` for linting (`./.lintr`). +If running in an IDE, rely on language server notes. +Prefer explanatory names over comments in your code. + +Let errors propagate naturally; only add early guards when the default error message would confuse the user. +Use the `stopifnot("error message" = condition)` pattern for assertions. + +## Testing + +Use tinytest, see `inst/tinytest`. +Non-exported functions need to be tested as `evoland:::private_function`, because a package can be tested after installation. +Be parsimonious when constructing tests. + +- Test the full package using `R -e "tinytest::build_test_install()"`. +- Test individual files using `R -e "pkgload::load_all(); tinytest::run_test_file('inst/tinytest/somefile.R')"` + +## Documentation + +Use markdown roxygen for function and class documentation. +Use `R -e "roxygen2::roxygenize()"` to synchronize roxygen comments and `.Rd` files. +Writte tutorials and how-to as quarto files in `vignettes/*.qmd`. + +## Rcpp components + +C++ code is in the `src/` folder. +This code interfaces with R using Rcpp. +Build binaries using `pkgbuild::build()`. +Clean build objects using `pkgbuild::clean_dll()`. +Prefer `Rcpp` types. +Only write Rcpp-free headers when the code should also compile as a standalone program. + +## Database + +- Storage is done in parquet files written and read via an in-memory DuckDB instance. +- `R/parquet_db.R` specifies an R6 class that provides database operations to write and retrieve `data.table` objects to parquet files. + - `parquet_db_t` is a subclass of the `data.table` S3 class, see `R/parquet_db_utils.R` + - `parquet_db_t` objects can hold attributes used to define + - key columns, i.e. uniqueness columns + - hive partitioning columns + - map columns, i.e. R list columns of named lists translated to DuckDB MAP columns +- Domain specific database elements are in `R/evoland_db.R`; `evoland_db` inherits from `parquet_db`. + - The schema for this database is (for now) distributed across the class definitions: all `R/*_t.R` files contain `as_*_t` class constructors using `as_parquet_db_t`. + - Because the parquet files may be written to from external tools, they should be considered part of the API. Schema changes should be avoided as much as possible. + - Ad-hoc views are suffixed `_v` and generally exposed as active bindings, or as methods if they are parameterized. + - Every new method must be added in `R/evoland_db.R` and _not_ with `$set`. This is because the roxygen documentation routine for R6 objects relies on all documentation being available in a single file. diff --git a/rproject.toml b/rproject.toml new file mode 100644 index 0000000..e21002e --- /dev/null +++ b/rproject.toml @@ -0,0 +1,16 @@ +[project] +name = "evoland-plus" +r_version = "4.5" + +repositories = [ + { alias = "CRAN", url = "https://stat.ethz.ch/CRAN/" }, +] + +dependencies = [ + { name = "evoland", path = ".", install_suggestions = true }, + # dev dependencies + "devtools", + "mirai", + "httpgd", + "languageserver", +] From 8db92eefec51f6d184805cf6091d5cf8a4017e14 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:59:10 +0200 Subject: [PATCH 08/15] slightly edit comments --- R/trans_models_t.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 1ec4bfc..1c8dfe6 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -108,11 +108,12 @@ fit_partial_model_worker <- function( sample(idx_false, n_train_false) ) - # Subset to task columns (did_transition + predictors) and coerce target + # Subset to task columns (did_transition + predictors) task_cols <- c("did_transition", pred_cols) train_data <- trans_pred_data_full[train_idx, .SD, .SDcols = task_cols] test_data <- trans_pred_data_full[-train_idx, .SD, .SDcols = task_cols] + # Coerce target; mlr3 uses factors internally also for twoclass classification train_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] test_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] From 98394024f3e36c90c33c2fc1a5934080a983d380 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:41:29 +0000 Subject: [PATCH 09/15] Refactor fit_full_models: two modes, select_score/select_maximize, simplified fetch; fix docs/guards/tests Agent-Logs-Url: https://github.com/ethzplus/evoland-plus/sessions/44f0fa02-3b76-45d5-a99b-1d20977bcc9e Co-authored-by: mmyrte <24587121+mmyrte@users.noreply.github.com> --- R/evoland_db.R | 22 +- R/trans_models_t.R | 318 ++++++++++++---------- inst/tinytest/test_integ_allocation.R | 30 +- inst/tinytest/test_integ_trans_models_t.R | 76 ++++-- 4 files changed, 264 insertions(+), 182 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index ba0064d..0693518 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -171,19 +171,19 @@ evoland_db <- R6::R6Class( }, #' @description - #' Fit full models on complete data using the best partial model configuration for - #' each transition, see [fit_full_models()] - #' @param learner An mlr3 `Learner` or `AutoTuner` object; used as last-resort - #' fallback for reconstruction. - #' @param measures A list of mlr3 `Measure` objects; kept for API consistency. - #' @param gof_criterion Which cross-validation measure to use for model selection (e.g., `"classif.auc"`) - #' @param gof_maximize Maximize (TRUE) or minimize (FALSE) the gof_criterion? + #' Fit full models (trained on the complete dataset) for each viable transition, + #' see [fit_full_models()]. Two mutually exclusive modes: pass `learner` to train + #' directly, or pass `select_score` to pick the best partial model by score. + #' @param learner An mlr3 `Learner` or `AutoTuner` for direct-learner mode (`NULL` + #' when `select_score` is used). + #' @param select_score Measure ID string for score-select mode, e.g. `"classif.auc"` + #' (`NULL` when `learner` is used). + #' @param select_maximize Logical; maximize (`TRUE`) or minimize (`FALSE`) the score. #' @param cluster Optional cluster object for parallel processing fit_full_models = function( - learner, - measures, - gof_criterion, - gof_maximize, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, cluster = NULL ) { create_method_binding(fit_full_models) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 1c8dfe6..b42421e 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -11,7 +11,8 @@ #' @return A data.table of class "trans_models_t" with columns: #' - `id_run`: Foreign key to runs_t #' - `id_trans`: Foreign key to trans_meta_t -#' - `learner_id`: mlr3 learner key, e.g. `"classif.ranger"` +#' - `learner_id`: mlr3 twoclass [LearnerClassif](https://mlr3.mlr-org.com/reference/LearnerClassif.html) +#' key, e.g. `"classif.ranger"` #' - `learner_params`: MAP of atomic scalar learner hyperparameters for #' querying; complete hyperparameters are captured by `learner_spec` #' - `learner_spec`: BLOB of serialized untrained mlr3 `Learner`; for @@ -60,10 +61,6 @@ fit_partial_model_worker <- function( seed = NULL, sample_frac = 0.7 ) { - if (!requireNamespace("mlr3", quietly = TRUE)) { - stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") - } - id_run_orig <- db$id_run on.exit(db$id_run <- id_run_orig, add = TRUE) db$id_run <- item[["id_run"]] @@ -178,12 +175,11 @@ fit_partial_model_worker <- function( } # Worker function for full model fitting -# Not exported; used internally by fit_full_models -fit_full_model_worker <- function(item, db, ...) { - if (!requireNamespace("mlr3", quietly = TRUE)) { - stop("Package 'mlr3' is required. Install with: install.packages('mlr3')") - } - +# Not exported; used internally by fit_full_models. +# Operates in two modes depending on whether `learner` is NULL: +# - direct mode (learner != NULL): train the passed learner clone on full data +# - score-select mode (learner == NULL): reconstruct from item$learner_spec and retrain +fit_full_model_worker <- function(item, db, learner = NULL, ...) { tryCatch( { # Fetch full data @@ -209,22 +205,39 @@ fit_full_model_worker <- function(item, db, ...) { positive = "TRUE" ) - # Reconstruct learner: try learner_spec first, fall back to do.call(lrn, ...) - learner_spec_raw <- item[["learner_spec"]][[1L]] - learner_id_val <- item[["learner_id"]] - learner_params_val <- item[["learner_params"]][[1L]] - - trained_learner <- tryCatch( - qs2::qs_deserialize(learner_spec_raw), - error = function(e) { - fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) - warning(glue::glue( - "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", - "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" - )) - fallback - } - ) + if (!is.null(learner)) { + # Direct mode: use the passed learner directly + trained_learner <- learner$clone(deep = TRUE) + learner_id_val <- trained_learner$id + learner_params_val <- Filter( + function(v) is.atomic(v) && length(v) == 1L, + trained_learner$param_set$values + ) + learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val + learner_spec_blob <- qs2::qs_serialize(trained_learner$clone(deep = TRUE)$reset()) + crossval_score_val <- list(NULL) + crossval_predictions_val <- list(NULL) + } else { + # Score-select mode: reconstruct from learner_spec; fall back to do.call + learner_spec_raw <- item[["learner_spec"]][[1L]] + learner_id_val <- item[["learner_id"]] + learner_params_val <- item[["learner_params"]][[1L]] + trained_learner <- tryCatch( + qs2::qs_deserialize(learner_spec_raw), + error = function(e) { + fallback <- do.call(mlr3::lrn, c(list(learner_id_val), as.list(learner_params_val))) + warning(glue::glue( + "learner_spec deserialization failed for {learner_id_val}: {e$message}; ", + "falling back to reconstructed learner: {paste(fallback$format(), collapse = ' ')}" + )) + fallback + } + ) + learner_spec_blob <- learner_spec_raw + # Pass through cross-validation results from the partial model + crossval_score_val <- item[["crossval_score"]] + crossval_predictions_val <- item[["crossval_predictions"]] + } trained_learner$train(full_task) @@ -232,6 +245,10 @@ fit_full_model_worker <- function(item, db, ...) { id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = learner_id_val, + learner_params = list(learner_params_val), + learner_spec = list(learner_spec_blob), + crossval_score = crossval_score_val, + crossval_predictions = crossval_predictions_val, learner_full = list(qs2::qs_serialize(trained_learner)) ) }, @@ -242,7 +259,11 @@ fit_full_model_worker <- function(item, db, ...) { list( id_run = item[["id_run"]], id_trans = item[["id_trans"]], - learner_id = item[["learner_id"]], + learner_id = if (!is.null(learner)) learner$id else item[["learner_id"]], + learner_params = list(NULL), + learner_spec = list(NULL), + crossval_score = list(NULL), + crossval_predictions = list(NULL), learner_full = list(NULL) ) } @@ -251,7 +272,9 @@ fit_full_model_worker <- function(item, db, ...) { #' @describeIn trans_models_t Fit partial (cross-validation) models for each viable -#' transition and store results in a trans_models_t table. +#' transition; returns a [trans_models_t] object with one row per viable transition, +#' containing the learner identity, serialized spec, cross-validation scores +#' (`crossval_score`), and serialized held-out predictions (`crossval_predictions`). #' @param self [evoland_db] instance to query for transitions and predictor data #' @param learner An mlr3 `Learner` or `AutoTuner` object. A deep clone is trained #' for each transition; the original object is not modified. For `AutoTuner`, @@ -267,9 +290,6 @@ fit_full_model_worker <- function(item, db, ...) { #' @param seed Optional integer seed for reproducible subsampling. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. -#' @return A [trans_models_t] table with one row per viable transition, containing -#' the learner identity, serialized spec, cross-validation scores (`crossval_score`), -#' and serialized held-out predictions (`crossval_predictions`). fit_partial_models <- function( self, learner, @@ -330,131 +350,151 @@ fit_partial_models <- function( as_trans_models_t() } -#' @describeIn trans_models_t Fit full models for each transition based on the best -#' partial model according to a specified cross-validation criterion. +#' @describeIn trans_models_t Fit full models (trained on the complete dataset) for each +#' viable transition and return a [trans_models_t] object with `learner_full` populated. +#' Two mutually exclusive modes are supported: +#' - **Direct-learner mode** (`learner` provided, `select_score` omitted): a fresh clone of +#' `learner` is trained on the full data for each transition. `crossval_score` and +#' `crossval_predictions` will be `NULL` in the result. Does not require a prior +#' call to [fit_partial_models()]. +#' - **Score-select mode** (`select_score` provided, `learner` omitted): selects the best +#' partial model per transition by `select_score`, reconstructs its learner from +#' `learner_spec`, and retrains on the full data. Requires [fit_partial_models()] to +#' have been run first. #' @param self [evoland_db] instance to query for transitions and predictor data -#' @param learner An mlr3 `Learner` or `AutoTuner` object; kept for API consistency and -#' used as a last-resort fallback if both `learner_spec` deserialization and -#' `do.call(mlr3::lrn, ...)` reconstruction fail. -#' @param measures Either a character vector of mlr3 measure IDs or a list of `Measure` -#' objects; kept for API consistency. -#' @param gof_criterion Character string specifying which cross-validation score to use -#' for selecting the best partial model per transition (must match a key in -#' `crossval_score`, e.g. `"classif.auc"`). -#' @param gof_maximize Logical; select the model with the maximum (`TRUE`) or minimum -#' (`FALSE`) value of `gof_criterion`. Default is `TRUE`. +#' @param learner An mlr3 `Learner` or `AutoTuner` object for direct-learner mode. +#' Must be `NULL` when `select_score` is provided. +#' @param select_score Character string; mlr3 measure ID (e.g. `"classif.auc"`) used to +#' rank partial models in score-select mode. Must be `NULL` when `learner` is provided. +#' @param select_maximize Logical; if `TRUE` (default) the model with the highest +#' `select_score` is selected; if `FALSE`, the lowest. Only used in score-select mode. #' @param cluster An optional cluster object created by [parallel::makeCluster()] or #' [mirai::make_cluster()]. -#' @return A [trans_models_t] table with one row per transition, containing the columns -#' from the best partial model plus `learner_full` with the serialized fully-trained -#' learner. fit_full_models <- function( self, - learner, - measures, - gof_criterion, - gof_maximize, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, cluster = NULL ) { + has_learner <- !is.null(learner) + has_score <- !is.null(select_score) + stopifnot( - "gof_criterion must be a character string" = is.character(gof_criterion) && - length(gof_criterion) == 1L, - "gof_maximize must be set to TRUE or FALSE" = isTRUE(gof_maximize) || isFALSE(gof_maximize), - "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) + "Provide exactly one of 'learner' or 'select_score'" = xor(has_learner, has_score) ) - # Get the best partial model per transition (scalar columns only; MAP/BLOB via fetch below) - best_model_ids <- self$get_query(glue::glue( - r"[ - with preds_nested as ( - select - id_run, - id_trans, - list(id_pred) as id_pred - from - {self$get_read_expr("trans_preds_t")} - group by - id_run, id_trans + if (has_learner) { + stopifnot( + "learner must be an mlr3 Learner or AutoTuner" = inherits(learner, "Learner") ) - select - tm.id_run, - tm.id_trans, - tm.learner_id, - pn.id_pred, - from - {self$get_read_expr("trans_models_t")} tm, - preds_nested pn - where - pn.id_run = tm.id_run - and pn.id_trans = tm.id_trans - qualify row_number() over ( - partition by tm.id_run, tm.id_trans - order by tm.crossval_score['{gof_criterion}'] {ifelse(gof_maximize, "desc", "asc")} - ) = 1; - ]" - )) - # Fetch learner_spec (BLOB) and learner_params (MAP) for the best rows via fetch() - # so that MAP columns are properly deserialized to named lists. - learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") - best_specs <- self$fetch( - "trans_models_t", - cols = c("id_run", "id_trans", "learner_id", "learner_spec", "learner_params"), - where = glue::glue( - "id_run in ({toString(best_model_ids$id_run)}) and ", - "id_trans in ({toString(best_model_ids$id_trans)}) and ", - "learner_id in ({learner_id_csv})" + # Direct mode: get viable transitions with their predictor lists + trans_preds_nested <- + data.table::as.data.table(self$trans_preds_t)[, + .(id_pred = list(id_pred)), + by = .(id_run, id_trans) + ] + + viable_trans <- + self$trans_meta_t[ + is_viable == TRUE, + .(id_trans) + ][ + trans_preds_nested, + on = "id_trans" + ] + + message(glue::glue( + "Fitting full models for {nrow(viable_trans)} transitions..." + )) + + viable_trans |> + split(by = c("id_run", "id_trans")) |> + run_parallel_evoland( + items = _, + worker_fun = fit_full_model_worker, + parent_db = self, + cluster = cluster, + learner = learner, + ) |> + data.table::rbindlist() |> + as_trans_models_t() + } else { + # Score-select mode + stopifnot( + "select_score must be a character string" = is.character(select_score) && + length(select_score) == 1L, + "select_maximize must be TRUE or FALSE" = isTRUE(select_maximize) || isFALSE(select_maximize), + "trans_models_t is missing" = file.exists(self$get_table_path("trans_models_t")) ) - ) - # Join to add id_pred and build complete item list for workers - best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] + # Identify the best partial model per transition (using QUALIFY window function) + # and get the predictor ID lists from trans_preds_t in the same query + best_model_ids <- self$get_query(glue::glue( + r"[ + with preds_nested as ( + select + id_run, + id_trans, + list(id_pred) as id_pred + from + {self$get_read_expr("trans_preds_t")} + group by + id_run, id_trans + ) + select + tm.id_run, + tm.id_trans, + tm.learner_id, + pn.id_pred, + from + {self$get_read_expr("trans_models_t")} tm, + preds_nested pn + where + pn.id_run = tm.id_run + and pn.id_trans = tm.id_trans + qualify row_number() over ( + partition by tm.id_run, tm.id_trans + order by tm.crossval_score['{select_score}'] {ifelse(select_maximize, "desc", "asc")} + ) = 1; + ]" + )) - message(glue::glue( - "Fitting full models for {nrow(best_models)} transitions..." - )) + # Fetch all trans_models_t columns (including MAP/BLOB) for the best rows in one shot + learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") + best_specs <- self$fetch( + "trans_models_t", + cols = c( + "id_run", "id_trans", "learner_id", + "learner_spec", "learner_params", + "crossval_score", "crossval_predictions" + ), + where = glue::glue( + "id_run in ({toString(best_model_ids$id_run)}) and ", + "id_trans in ({toString(best_model_ids$id_trans)}) and ", + "learner_id in ({learner_id_csv})" + ) + ) - full_models <- - best_models |> - split(by = c("id_run", "id_trans")) |> - run_parallel_evoland( - items = _, - worker_fun = fit_full_model_worker, - parent_db = self, - cluster = cluster, - ) |> - data.table::rbindlist() + # Join to add id_pred from the ranking query + best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] - # Fetch remaining columns from the best partial models and join - partial_models <- self$fetch( - "trans_models_t", - cols = c( - "id_run", - "id_trans", - "learner_id", - "learner_params", - "learner_spec", - "crossval_score", - "crossval_predictions" - ), - where = glue::glue( - "id_run in ({toString(full_models$id_run)}) and ", - "id_trans in ({toString(full_models$id_trans)}) and ", - "learner_id in ({paste0(\"'\", full_models$learner_id, \"'\", collapse = \", \")})" - ) - ) + message(glue::glue( + "Fitting full models for {nrow(best_models)} transitions..." + )) - full_models[ - partial_models, - on = c("id_run", "id_trans", "learner_id"), - `:=`( - learner_params = i.learner_params, - learner_spec = i.learner_spec, - crossval_score = i.crossval_score, - crossval_predictions = i.crossval_predictions - ) - ] |> - as_trans_models_t() + best_models |> + split(by = c("id_run", "id_trans")) |> + run_parallel_evoland( + items = _, + worker_fun = fit_full_model_worker, + parent_db = self, + cluster = cluster, + ) |> + data.table::rbindlist() |> + as_trans_models_t() + } } diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 0dd1429..2594ab7 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -5,10 +5,6 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } -if (!requireNamespace("mlr3", quietly = TRUE)) { - exit_file("mlr3 not available; skipping allocation integration tests") -} - source(file.path(system.file("tinytest", package = "evoland"), "helper_testdb.R")) db <- make_test_db() db$trans_rates_t <- db$get_obs_trans_rates() @@ -19,7 +15,7 @@ db$trans_rates_t <- extrapolate_trans_rates( ) test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") -test_measures <- list(mlr3::msr("classif.auc")) +test_measures <- c("classif.auc") # test the package's featureless learner fit and append to disk expect_message( @@ -30,15 +26,31 @@ expect_message( ), "Fitting partial models for 2 transitions..." ) + +# Score-select mode: pick best partial model by classif.auc and retrain on full data expect_message( db$trans_models_t <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE + ), + "Fitting full models for" +) + +# Direct-learner mode: train a fresh learner on full data without cross-validation +db_direct <- make_test_db() +db_direct$trans_rates_t <- db_direct$get_obs_trans_rates() +db_direct$trans_rates_t <- extrapolate_trans_rates( + db_direct$trans_rates_t, + db_direct$periods_t, + coord_count = nrow(db_direct$coords_t) +) +expect_message( + full_direct <- db_direct$fit_full_models( + learner = test_learner ), "Fitting full models for" ) +expect_true(all(vapply(full_direct$learner_full, is.raw, logical(1)))) # no data for period 4 yet expect_equal(nrow(db$fetch("lulc_data_t", where = "id_period = 4")), 0L) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 5600065..53b3935 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -32,10 +32,6 @@ if (!at_home()) { exit_file("Integration tests skipped (not at_home)") } -if (!requireNamespace("mlr3", quietly = TRUE)) { - exit_file("mlr3 not available; skipping integration tests") -} - # Load fixtures via helper source(file.path( system.file("tinytest", package = "evoland"), @@ -101,13 +97,11 @@ expect_true(all(vapply( logical(1) ))) -# Test fit_full_models, which reconstructs and retrains on full data +# Test fit_full_models in score-select mode (picks best partial model by crossval_score) expect_message( db$trans_models_t <- full_models <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ), "Fitting full models for" ) @@ -133,10 +127,8 @@ expect_false(is.null(deserialized_full$model)) # Test model selection with minimize criterion expect_message( full_models_min <- db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.acc", - gof_maximize = FALSE + select_score = "classif.acc", + select_maximize = FALSE ), "Fitting full models for" ) @@ -178,14 +170,12 @@ expect_error( "sample_frac must be between 0 and 1" ) -# Test error handling - missing trans_models_t for full model fitting +# Test error handling - missing trans_models_t for full model fitting (score-select mode) db$delete_from("trans_models_t") expect_error( db$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ), "trans_models_t is missing" ) @@ -217,7 +207,7 @@ expect_equal( "error" ) -# Test print method +# Test print method (score-select mode) db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) db3$trans_models_t <- db3$fit_partial_models( learner = test_learner, @@ -225,12 +215,52 @@ db3$trans_models_t <- db3$fit_partial_models( seed = 42 ) db3$trans_models_t <- db3$fit_full_models( - learner = test_learner, - measures = test_measures, - gof_criterion = "classif.auc", - gof_maximize = TRUE + select_score = "classif.auc", + select_maximize = TRUE ) expect_stdout( print(db3$trans_models_t), "Transition Models Table|Total models" ) + +# Test direct-learner mode: fit_full_models with a learner (no partial models needed) +db4 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +expect_message( + full_models_direct <- db4$fit_full_models( + learner = test_learner + ), + "Fitting full models for" +) +# direct mode: crossval_score and crossval_predictions should be NULL +expect_true(all(vapply(full_models_direct$crossval_score, is.null, logical(1)))) +expect_true(all(vapply(full_models_direct$crossval_predictions, is.null, logical(1)))) +# learner_full should be populated +expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]]) +expect_true(inherits(deserialized_direct, "Learner")) +expect_false(is.null(deserialized_direct$model)) + +# Test get_crossval_plots (requires mlr3viz) +if (!requireNamespace("mlr3viz", quietly = TRUE)) { + exit_file("mlr3viz not available; skipping get_crossval_plots tests") +} + +db5 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) +db5$trans_models_t <- db5$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 +) + +plots <- db5$get_crossval_plots() +expect_true(is.list(plots)) +expect_equal(length(plots), nrow(db5$trans_models_t)) +# Each element should be a ggplot (or NULL for rows without predictions) +non_null_plots <- Filter(Negate(is.null), plots) +expect_true(length(non_null_plots) > 0L) +expect_true(all(vapply(non_null_plots, inherits, logical(1), "gg"))) + +# Filter by id_trans +plots_filtered <- db5$get_crossval_plots(id_trans = db5$trans_models_t$id_trans[[1]]) +expect_true(is.list(plots_filtered)) +expect_equal(length(plots_filtered), 1L) From ee71350cc8b0ccf1cc6c99689eb285943e038b1f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:53:22 +0200 Subject: [PATCH 10/15] better mlr3 tests; caught warning-inducing edge cases --- R/parquet_db.R | 3 + R/parquet_db_utils.R | 21 ++-- R/trans_models_t.R | 20 ++-- inst/tinytest/test_integ_allocation.R | 16 --- inst/tinytest/test_integ_trans_models_t.R | 132 +++++++++------------- man/evoland_db.Rd | 52 ++++++--- man/parquet_db_utils.Rd | 3 +- man/trans_models_t.Rd | 90 ++++++++++----- 8 files changed, 179 insertions(+), 158 deletions(-) diff --git a/R/parquet_db.R b/R/parquet_db.R index ceae80b..bc33530 100644 --- a/R/parquet_db.R +++ b/R/parquet_db.R @@ -149,6 +149,9 @@ parquet_db <- R6::R6Class( metadata <- private$read_parquet_metadata(table_path) map_cols <- resolve_cols(NULL, metadata, "map_cols") + if (!is.null(cols)) { + map_cols <- intersect(cols, map_cols) + } read_expr <- self$get_read_expr(table_name) # build sql query diff --git a/R/parquet_db_utils.R b/R/parquet_db_utils.R index 63d3306..6d367dc 100644 --- a/R/parquet_db_utils.R +++ b/R/parquet_db_utils.R @@ -104,13 +104,17 @@ validate.parquet_db_t <- function(x, ...) { } } else if (col %in% attr(x, "map_cols")) { for (val in x[[col]]) { - if ( - !is.null(val) && # if not NULL any of the following being true is an error - (!is.list(val) || # if it's a list - is.null(names(val)) || # or names missing - any(vapply(val, Negate(is.atomic), logical(1)))) # or any element not atomic - ) { - # then throw error + is_valid_mapcol <- function(v) { + # allow NULLs + is.null(v) || + # allow empty lists + (is.list(v) && length(v) == 0) || + # allow named lists with atomic values + (is.list(v) && !is.null(names(v)) && all(vapply(v, is.atomic, logical(1)))) + } + + # negation makes the above condition easier to read + if (Negate(is_valid_mapcol)(val)) { stop(glue::glue( "Column '{col}' specified as map_cols must be a list of ", "named lists with atomic values" @@ -304,7 +308,8 @@ kv_df_to_list <- function(x) { #' (missing argument), or upserts to it (assignment operation) #' @param table_name The name of the table to bind to. #' @param mode The mode of the binding, which determines the behavior when -#' committing data. Options are: "write_once" (default, only allows writing if table doesn't exist), "upsert" +#' committing data. Options are: "write_once" (default, only allows writing if +#' table doesn't exist), "upsert", "append", and "overwrite". create_table_binding <- function( table_name, mode = c("write_once", "upsert", "append", "overwrite") diff --git a/R/trans_models_t.R b/R/trans_models_t.R index b42421e..3e62451 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -164,9 +164,9 @@ fit_partial_model_worker <- function( id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = "error", - learner_params = list(NULL), + learner_params = list(list()), learner_spec = list(NULL), - crossval_score = list(NULL), + crossval_score = list(list()), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -215,7 +215,7 @@ fit_full_model_worker <- function(item, db, learner = NULL, ...) { ) learner_params_val <- if (length(learner_params_val) == 0L) NULL else learner_params_val learner_spec_blob <- qs2::qs_serialize(trained_learner$clone(deep = TRUE)$reset()) - crossval_score_val <- list(NULL) + crossval_score_val <- list(list()) crossval_predictions_val <- list(NULL) } else { # Score-select mode: reconstruct from learner_spec; fall back to do.call @@ -260,9 +260,9 @@ fit_full_model_worker <- function(item, db, learner = NULL, ...) { id_run = item[["id_run"]], id_trans = item[["id_trans"]], learner_id = if (!is.null(learner)) learner$id else item[["learner_id"]], - learner_params = list(NULL), + learner_params = list(list()), learner_spec = list(NULL), - crossval_score = list(NULL), + crossval_score = list(list()), crossval_predictions = list(NULL), learner_full = list(NULL) ) @@ -466,9 +466,13 @@ fit_full_models <- function( best_specs <- self$fetch( "trans_models_t", cols = c( - "id_run", "id_trans", "learner_id", - "learner_spec", "learner_params", - "crossval_score", "crossval_predictions" + "id_run", + "id_trans", + "learner_id", + "learner_spec", + "learner_params", + "crossval_score", + "crossval_predictions" ), where = glue::glue( "id_run in ({toString(best_model_ids$id_run)}) and ", diff --git a/inst/tinytest/test_integ_allocation.R b/inst/tinytest/test_integ_allocation.R index 2594ab7..ff9925a 100644 --- a/inst/tinytest/test_integ_allocation.R +++ b/inst/tinytest/test_integ_allocation.R @@ -36,22 +36,6 @@ expect_message( "Fitting full models for" ) -# Direct-learner mode: train a fresh learner on full data without cross-validation -db_direct <- make_test_db() -db_direct$trans_rates_t <- db_direct$get_obs_trans_rates() -db_direct$trans_rates_t <- extrapolate_trans_rates( - db_direct$trans_rates_t, - db_direct$periods_t, - coord_count = nrow(db_direct$coords_t) -) -expect_message( - full_direct <- db_direct$fit_full_models( - learner = test_learner - ), - "Fitting full models for" -) -expect_true(all(vapply(full_direct$learner_full, is.raw, logical(1)))) - # no data for period 4 yet expect_equal(nrow(db$fetch("lulc_data_t", where = "id_period = 4")), 0L) diff --git a/inst/tinytest/test_integ_trans_models_t.R b/inst/tinytest/test_integ_trans_models_t.R index 53b3935..032bd90 100644 --- a/inst/tinytest/test_integ_trans_models_t.R +++ b/inst/tinytest/test_integ_trans_models_t.R @@ -43,7 +43,6 @@ db <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) test_learner <- mlr3::lrn("classif.featureless", predict_type = "prob") # measures can be passed as a character vector of IDs (convenience) or as a list of Measure objects test_measures <- c("classif.auc", "classif.acc") - # Test fit_partial_models expect_message( db$trans_models_t <- partial_models <- db$fit_partial_models( @@ -54,27 +53,14 @@ expect_message( ), "Fitting partial models for 2 transitions..." ) -expect_length( - partial_models, - 8L # columns -) -expect_true(all( - c( - "id_run", - "id_trans", - "learner_id", - "learner_params", - "learner_spec", - "crossval_score", - "crossval_predictions", - "learner_full" - ) %in% - names(partial_models) -)) expect_equal( - unique(partial_models$learner_id), - "classif.featureless" + partial_models[["crossval_score"]], + list( + list(classif.auc = 0.5, classif.acc = 0.547945205479452), + list(classif.auc = 0.5, classif.acc = 0.536050156739812) + ) ) +expect_equal(partial_models$learner_id[1], "classif.featureless") expect_true(all( vapply(partial_models$learner_spec, is.raw, logical(1)) )) @@ -90,13 +76,6 @@ deserialized_spec <- qs2::qs_deserialize(partial_models$learner_spec[[1]]) expect_true(inherits(deserialized_spec, "Learner")) expect_equal(deserialized_spec$id, "classif.featureless") -# crossval_score should be named lists with measure IDs as keys -expect_true(all(vapply( - partial_models$crossval_score, - function(m) !is.null(m) && is.list(m) && "classif.auc" %in% names(m), - logical(1) -))) - # Test fit_full_models in score-select mode (picks best partial model by crossval_score) expect_message( db$trans_models_t <- full_models <- db$fit_full_models( @@ -180,13 +159,8 @@ expect_error( "trans_models_t is missing" ) -# Test fit function that throws an error (simulate via a bad learner) -# We achieve this by running with a learner that fails (no viable data after -# deleting the table above means we re-populate and then use a bad setup). -# Instead, test via a mock that warns on error using an out-of-range sample_frac -# -> use a real error scenario by removing all preds -db2 <- make_test_db(include_neighbors = FALSE, include_trans_preds = FALSE) -db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( +# Test fit function that throws an error: overwrite trans_preds_t +db$trans_preds_t <- as_trans_preds_t(data.table::data.table( id_run = 0L, id_pred = 99999L, # non-existent predictor id_trans = 1L @@ -194,73 +168,73 @@ db2$trans_preds_t <- as_trans_preds_t(data.table::data.table( expect_warning( partial_models_error <- - db2$fit_partial_models( + db$fit_partial_models( learner = test_learner, measures = test_measures, - sample_frac = 0.7, - seed = 123 + sample_frac = 0.7 ), "No predictor columns|No data" ) -expect_equal( - partial_models_error$learner_id, - "error" -) +expect_equal(partial_models_error$learner_id, "error") -# Test print method (score-select mode) -db3 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -db3$trans_models_t <- db3$fit_partial_models( - learner = test_learner, - measures = test_measures, - seed = 42 -) -db3$trans_models_t <- db3$fit_full_models( - select_score = "classif.auc", - select_maximize = TRUE -) -expect_stdout( - print(db3$trans_models_t), - "Transition Models Table|Total models" +# Test direct-learner mode: fit_full_models with a learner +db$set_full_trans_preds() +expect_message( + db$trans_models_t <- full_models_direct <- db$fit_full_models(learner = test_learner), + "Fitting full models for" ) +# direct mode: crossval_score and crossval_predictions should be length 0 +expect_true(all(vapply(full_models_direct$crossval_score, length, integer(1)) == 0L)) +expect_true(all(vapply(full_models_direct$crossval_predictions, length, integer(1)) == 0L)) +# learner_full should be populated +expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]])$reset() +expect_equal(deserialized_direct, test_learner) -# Test direct-learner mode: fit_full_models with a learner (no partial models needed) -db4 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) expect_message( - full_models_direct <- db4$fit_full_models( - learner = test_learner - ), + db$trans_models_t <- full_models_direct <- db$fit_full_models(learner = test_learner), "Fitting full models for" ) -# direct mode: crossval_score and crossval_predictions should be NULL -expect_true(all(vapply(full_models_direct$crossval_score, is.null, logical(1)))) -expect_true(all(vapply(full_models_direct$crossval_predictions, is.null, logical(1)))) +# direct mode: crossval_score and crossval_predictions should be length 0 +expect_true(all(vapply(full_models_direct$crossval_score, length, integer(1)) == 0L)) +expect_true(all(vapply(full_models_direct$crossval_predictions, length, integer(1)) == 0L)) # learner_full should be populated expect_true(all(vapply(full_models_direct$learner_full, is.raw, logical(1)))) -deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]]) -expect_true(inherits(deserialized_direct, "Learner")) -expect_false(is.null(deserialized_direct$model)) +deserialized_direct <- qs2::qs_deserialize(full_models_direct$learner_full[[1]])$reset() +expect_equal(deserialized_direct, test_learner) # Test get_crossval_plots (requires mlr3viz) if (!requireNamespace("mlr3viz", quietly = TRUE)) { exit_file("mlr3viz not available; skipping get_crossval_plots tests") } -db5 <- make_test_db(include_neighbors = FALSE, include_trans_preds = TRUE) -db5$trans_models_t <- db5$fit_partial_models( - learner = test_learner, - measures = test_measures, - seed = 42 +expect_message( + db$trans_models_t <- db$fit_partial_models( + learner = test_learner, + measures = test_measures, + seed = 42 + ), + "Fitting partial models for 2 transitions..." ) -plots <- db5$get_crossval_plots() +plots <- db$get_crossval_plots() expect_true(is.list(plots)) -expect_equal(length(plots), nrow(db5$trans_models_t)) -# Each element should be a ggplot (or NULL for rows without predictions) -non_null_plots <- Filter(Negate(is.null), plots) -expect_true(length(non_null_plots) > 0L) -expect_true(all(vapply(non_null_plots, inherits, logical(1), "gg"))) +expect_equal(length(plots), nrow(db$trans_models_t)) +expect_true(all(vapply(plots, inherits, logical(1), "gg"))) # Filter by id_trans -plots_filtered <- db5$get_crossval_plots(id_trans = db5$trans_models_t$id_trans[[1]]) -expect_true(is.list(plots_filtered)) +plots_filtered <- db$get_crossval_plots(id_trans = 1) expect_equal(length(plots_filtered), 1L) +plot_trans_1 <- plots_filtered[[1]] +expect_true(inherits(plot_trans_1, "gg")) +expect_equal( + plot_trans_1$data |> summary() |> as.vector(), + c( + "truth :219 ", + "response:219 ", + NA, + "Length:438 ", + "Class :character ", + "Mode :character " + ) +) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index caa03a2..3d72f4d 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -80,6 +80,7 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-lulc_data_as_rast}{\code{evoland_db$lulc_data_as_rast()}} \item \href{#method-evoland_db-fit_full_models}{\code{evoland_db$fit_full_models()}} \item \href{#method-evoland_db-fit_partial_models}{\code{evoland_db$fit_partial_models()}} +\item \href{#method-evoland_db-get_crossval_plots}{\code{evoland_db$get_crossval_plots()}} \item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} \item \href{#method-evoland_db-get_pruned_trans_preds_t}{\code{evoland_db$get_pruned_trans_preds_t()}} \item \href{#method-evoland_db-predict_trans_pot}{\code{evoland_db$predict_trans_pot()}} @@ -401,13 +402,14 @@ NULL (default), all periods are included.} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-fit_full_models}{}}} \subsection{Method \code{fit_full_models()}}{ -Fit full models on complete data using the best partial model configuration for -each transition, see \code{\link[=fit_full_models]{fit_full_models()}} +Fit full models (trained on the complete dataset) for each viable transition, +see \code{\link[=fit_full_models]{fit_full_models()}}. Two mutually exclusive modes: pass \code{learner} to train +directly, or pass \code{select_score} to pick the best partial model by score. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{evoland_db$fit_full_models( - partial_models, - gof_criterion, - gof_maximize, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, cluster = NULL )}\if{html}{\out{
}} } @@ -415,11 +417,13 @@ each transition, see \code{\link[=fit_full_models]{fit_full_models()}} \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{partial_models}}{A trans_models_t table with partial models (see \code{\link[=fit_partial_models]{fit_partial_models()}})} +\item{\code{learner}}{An mlr3 \code{Learner} or \code{AutoTuner} for direct-learner mode (\code{NULL} +when \code{select_score} is used).} -\item{\code{gof_criterion}}{Which goodness-of-fit metric to use for model selection (e.g., "auc")} +\item{\code{select_score}}{Measure ID string for score-select mode, e.g. \code{"classif.auc"} +(\code{NULL} when \code{learner} is used).} -\item{\code{gof_maximize}}{Maximize (TRUE) or minimize (FALSE) the gof_criterion?} +\item{\code{select_maximize}}{Logical; maximize (\code{TRUE}) or minimize (\code{FALSE}) the score.} \item{\code{cluster}}{Optional cluster object for parallel processing} } @@ -435,29 +439,45 @@ sampling. Models are trained on a subsample and evaluated on held-out data, see \code{\link[=fit_partial_models]{fit_partial_models()}} for details. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{evoland_db$fit_partial_models( - fit_fun, + learner, + measures, sample_frac = 0.7, - gof_fun, seed = NULL, - cluster = NULL, - ... + cluster = NULL )}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{fit_fun}}{Function for generating a model object.} +\item{\code{learner}}{An mlr3 \code{Learner} or \code{AutoTuner} R6 object.} -\item{\code{sample_frac}}{Fraction in \(0, 1\) for stratified sampling.} +\item{\code{measures}}{A list of mlr3 \code{Measure} objects for scoring the held-out split.} -\item{\code{gof_fun}}{Function to evaluate goodness of fit.} +\item{\code{sample_frac}}{Fraction in \(0, 1\) for stratified sampling.} \item{\code{seed}}{Random seed for reproducible sampling} \item{\code{cluster}}{Optional cluster object for parallel processing} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-get_crossval_plots}{}}} +\subsection{Method \code{get_crossval_plots()}}{ +Get cross-validation plots for stored predictions, see \code{\link[=get_crossval_plots]{get_crossval_plots()}} +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$get_crossval_plots(id_run = NULL, id_trans = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{id_run}}{Optional integer; filter by run ID.} -\item{\code{...}}{additional arguments passed to fit_fun} +\item{\code{id_trans}}{Optional integer; filter by transition ID.} } \if{html}{\out{
}} } diff --git a/man/parquet_db_utils.Rd b/man/parquet_db_utils.Rd index 40c0bcd..d6db2ae 100644 --- a/man/parquet_db_utils.Rd +++ b/man/parquet_db_utils.Rd @@ -79,7 +79,8 @@ hive style parquet file partitioning.} \item{table_name}{The name of the table to bind to.} \item{mode}{The mode of the binding, which determines the behavior when -committing data. Options are: "write_once" (default, only allows writing if table doesn't exist), "upsert"} +committing data. Options are: "write_once" (default, only allows writing if +table doesn't exist), "upsert", "append", and "overwrite".} \item{fun}{The underlying function to bind as an R6 method, which must have a \code{self} argument} diff --git a/man/trans_models_t.Rd b/man/trans_models_t.Rd index c92c778..8855e31 100644 --- a/man/trans_models_t.Rd +++ b/man/trans_models_t.Rd @@ -6,36 +6,44 @@ \alias{fit_partial_models} \alias{fit_full_models} \alias{print.trans_models_t} +\alias{get_crossval_plots} \title{Create Transition Models Table} \usage{ as_trans_models_t(x) fit_partial_models( self, - fit_fun, - gof_fun, + learner, + measures, sample_frac = 0.7, seed = NULL, - cluster = NULL, - ... + cluster = NULL ) -fit_full_models(self, gof_criterion, gof_maximize, cluster = NULL) +fit_full_models( + self, + learner = NULL, + select_score = NULL, + select_maximize = TRUE, + cluster = NULL +) \method{print}{trans_models_t}(x) + +get_crossval_plots(self, id_run = NULL, id_trans = NULL) } \arguments{ \item{x}{A list or data.frame coercible to a data.table} -\item{self, }{\link{evoland_db} instance to query for transitions and predictor data} +\item{self}{\link{evoland_db} instance} -\item{fit_fun}{Function that takes a data.frame with predictors and did_transition columns -and returns a fitted model object. The data argument is passed as the first argument -to the function, and additional arguments can be passed via ...} +\item{learner}{An mlr3 \code{Learner} or \code{AutoTuner} object for direct-learner mode. +Must be \code{NULL} when \code{select_score} is provided.} -\item{gof_fun}{Function that takes a fitted model object and a test data.frame and -returns a list of goodness-of-fit metrics. The model argument is passed as the first -argument and the test_data argument is passed as the second argument.} +\item{measures}{Either a character vector of mlr3 measure IDs +(e.g. \code{c("classif.auc", "classif.acc")}) or a list of instantiated mlr3 +\code{Measure} objects (e.g. \code{list(mlr3::msr("classif.auc"))}). Character IDs are +converted via \code{mlr3::msrs()} internally. Results are written to \code{crossval_score}.} \item{sample_frac}{Numeric between 0 and 1 indicating the fraction of data to use for training the partial models. The rest is used for @@ -47,33 +55,39 @@ testing).} \item{cluster}{An optional cluster object created by \code{\link[parallel:makeCluster]{parallel::makeCluster()}} or \code{\link[mirai:make_cluster]{mirai::make_cluster()}}.} -\item{gof_criterion}{Character string specifying which goodness-of-fit metric to use for -selecting the best partial model for each transition (e.g., "roc_auc", "rmse").} +\item{select_score}{Character string; mlr3 measure ID (e.g. \code{"classif.auc"}) used to +rank partial models in score-select mode. Must be \code{NULL} when \code{learner} is provided.} + +\item{select_maximize}{Logical; if \code{TRUE} (default) the model with the highest +\code{select_score} is selected; if \code{FALSE}, the lowest. Only used in score-select mode.} -\item{gof_maximize}{Logical indicating whether to select the model with the maximum -(TRUE) or minimum (FALSE) value of the specified goodness-of-fit criterion. Default -is TRUE.} +\item{id_run}{Optional integer; filter by run ID.} -\item{partial_models}{A trans_models_t table containing the fitted partial models and -their goodness-of-fit metrics.} +\item{id_trans}{Optional integer; filter by transition ID.} } \value{ A data.table of class "trans_models_t" with columns: \itemize{ \item \code{id_run}: Foreign key to runs_t \item \code{id_trans}: Foreign key to trans_meta_t -\item \code{model_family}: Model family (e.g., "rf", "glm", "bayesian") -\item \code{model_params}: Map of model (hyper) parameters -\item \code{goodness_of_fit}: Map of various measures of fit (e.g., ROC AUC, RMSE) -\item \code{fit_call}: Character string of the original fit function call for reproducibility -\item \code{model_obj_part}: BLOB of serialized model object for validation -\item \code{model_obj_full}: BLOB of serialized model object for extrapolation +\item \code{learner_id}: mlr3 twoclass \href{https://mlr3.mlr-org.com/reference/LearnerClassif.html}{LearnerClassif} +key, e.g. \code{"classif.ranger"} +\item \code{learner_params}: MAP of atomic scalar learner hyperparameters for +querying; complete hyperparameters are captured by \code{learner_spec} +\item \code{learner_spec}: BLOB of serialized untrained mlr3 \code{Learner}; for +AutoTuners, this is the optimal inner learner after tuning +\item \code{crossval_score}: MAP of cross-validation performance scores +(from \code{prediction$score(measures)}) +\item \code{crossval_predictions}: BLOB of serialized mlr3 \code{PredictionClassif} +on the held-out test split +\item \code{learner_full}: BLOB of serialized trained mlr3 \code{Learner} fitted on +the full dataset, used for extrapolation } } \description{ Creates a trans_models_t table for storing transition model metadata and serialized model objects. This function creates an empty table with proper -structure for storing fitted models. +structure for storing fitted models via the mlr3 interface. } \section{Methods (by generic)}{ \itemize{ @@ -82,10 +96,26 @@ structure for storing fitted models. }} \section{Functions}{ \itemize{ -\item \code{fit_partial_models()}: Fit partial models for each viable transition and store -results in a trans_models_t table. +\item \code{fit_partial_models()}: Fit partial (cross-validation) models for each viable +transition; returns a \link{trans_models_t} object with one row per viable transition, +containing the learner identity, serialized spec, cross-validation scores +(\code{crossval_score}), and serialized held-out predictions (\code{crossval_predictions}). + +\item \code{fit_full_models()}: Fit full models (trained on the complete dataset) for each +viable transition and return a \link{trans_models_t} object with \code{learner_full} populated. +Two mutually exclusive modes are supported: +\itemize{ +\item \strong{Direct-learner mode} (\code{learner} provided, \code{select_score} omitted): a fresh clone of +\code{learner} is trained on the full data for each transition. \code{crossval_score} and +\code{crossval_predictions} will be \code{NULL} in the result. Does not require a prior +call to \code{\link[=fit_partial_models]{fit_partial_models()}}. +\item \strong{Score-select mode} (\code{select_score} provided, \code{learner} omitted): selects the best +partial model per transition by \code{select_score}, reconstructs its learner from +\code{learner_spec}, and retrains on the full data. Requires \code{\link[=fit_partial_models]{fit_partial_models()}} to +have been run first. +} -\item \code{fit_full_models()}: Fit full models for each transition based on the best -partial model according to a specified goodness-of-fit criterion. +\item \code{get_crossval_plots()}: Deserialize cross-validation predictions and return +plots via \code{mlr3viz::autoplot()}. Requires the \code{mlr3viz} package. }} From ed1db6e4450cb2a5421409025a3a0c7668a09726 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 24 Apr 2026 17:04:00 +0200 Subject: [PATCH 11/15] adapt vignette to mlr3 --- vignettes/evoland.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/evoland.qmd b/vignettes/evoland.qmd index 135fc3e..29daac0 100644 --- a/vignettes/evoland.qmd +++ b/vignettes/evoland.qmd @@ -375,8 +375,8 @@ We can then pick the models with the best goodness of fit to retrain full models ```{r} #| label: trans-models db$trans_models_t <- db$fit_partial_models( - fit_fun = fit_glm, - gof_fun = gof_glm, + learner = mlr3::lrn("classif.featureless", predict_type = "prob"), + measures = c("classif.auc", "classif.acc"), sample_frac = 0.7, seed = 42 ) From 0f9e0f61de55d7bfb761c614504fcf525f215c1b Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 27 Apr 2026 10:34:59 +0200 Subject: [PATCH 12/15] use partition directly --- R/trans_models_t.R | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 3e62451..874d242 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -431,8 +431,9 @@ fit_full_models <- function( # Identify the best partial model per transition (using QUALIFY window function) # and get the predictor ID lists from trans_preds_t in the same query - best_model_ids <- self$get_query(glue::glue( - r"[ + best_models <- + self$get_query(glue::glue( + r"[ with preds_nested as ( select id_run, @@ -448,6 +449,10 @@ fit_full_models <- function( tm.id_trans, tm.learner_id, pn.id_pred, + tm.learner_spec, + tm.learner_params, + tm.crossval_score, + tm.crossval_predictions, from {self$get_read_expr("trans_models_t")} tm, preds_nested pn @@ -458,31 +463,12 @@ fit_full_models <- function( partition by tm.id_run, tm.id_trans order by tm.crossval_score['{select_score}'] {ifelse(select_maximize, "desc", "asc")} ) = 1; - ]" - )) - - # Fetch all trans_models_t columns (including MAP/BLOB) for the best rows in one shot - learner_id_csv <- paste0("'", best_model_ids$learner_id, "'", collapse = ", ") - best_specs <- self$fetch( - "trans_models_t", - cols = c( - "id_run", - "id_trans", - "learner_id", - "learner_spec", - "learner_params", - "crossval_score", - "crossval_predictions" - ), - where = glue::glue( - "id_run in ({toString(best_model_ids$id_run)}) and ", - "id_trans in ({toString(best_model_ids$id_trans)}) and ", - "learner_id in ({learner_id_csv})" + ]" + )) |> + convert_list_cols( + c("learner_params", "crossval_score"), + kv_df_to_list ) - ) - - # Join to add id_pred from the ranking query - best_models <- best_model_ids[best_specs, on = c("id_run", "id_trans", "learner_id")] message(glue::glue( "Fitting full models for {nrow(best_models)} transitions..." From ff66ba39ddfc769239c812298c8cba701315cb31 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:17:57 +0200 Subject: [PATCH 13/15] drop custom fit/crossvalidation functions --- DESCRIPTION | 2 - NAMESPACE | 4 -- R/trans_models_glm.R | 103 -------------------------------- R/trans_models_rf.R | 136 ------------------------------------------- man/fit_glm.Rd | 30 ---------- man/fit_ranger.Rd | 43 -------------- man/gof_glm.Rd | 33 ----------- man/gof_ranger.Rd | 36 ------------ 8 files changed, 387 deletions(-) delete mode 100644 R/trans_models_glm.R delete mode 100644 R/trans_models_rf.R delete mode 100644 man/fit_glm.Rd delete mode 100644 man/fit_ranger.Rd delete mode 100644 man/gof_glm.Rd delete mode 100644 man/gof_ranger.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 622ea9c..d016ffd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,8 +63,6 @@ Collate: 'reporting_t.R' 'runs_t.R' 'trans_meta_t.R' - 'trans_models_glm.R' - 'trans_models_rf.R' 'trans_pot_t.R' 'trans_preds_t.R' 'trans_rates_t.R' diff --git a/NAMESPACE b/NAMESPACE index a2d0d75..5135e9b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,10 +71,6 @@ export(evoland_db) export(exec_dinamica) export(extract_using_coords_t) export(extrapolate_trans_rates) -export(fit_glm) -export(fit_ranger) -export(gof_glm) -export(gof_ranger) export(grrf_filter) export(parquet_db) export(print_rowwise_yaml) diff --git a/R/trans_models_glm.R b/R/trans_models_glm.R deleted file mode 100644 index d48d895..0000000 --- a/R/trans_models_glm.R +++ /dev/null @@ -1,103 +0,0 @@ -#' GLM Model Fitting for Transition Models -#' -#' Fits a generalized linear model (GLM) with quasibinomial family for transition -#' modeling. The quasibinomial family is recommended over binomial as it better -#' handles overdispersion in the data. -#' -#' @param data A data.table containing the result column and predictor columns -#' (prefixed with "id_pred_") -#' @param ... Additional arguments (currently ignored, for future extensibility) -#' -#' @return A fitted GLM model object, optionally butchered to reduce memory footprint -#' -#' @details -#' The function: -#' - Uses quasibinomial family to handle overdispersion -#' - Automatically detects predictor columns (those starting with "id_pred_") -#' - Applies butcher::butcher() if the package is available to reduce model size -#' -#' @export -fit_glm <- function(data, ...) { - pred_cols <- grep("^id_pred_", names(data), value = TRUE) - - if (length(pred_cols) == 0) { - stop("No predictor columns found (expected columns starting with 'id_pred_')") - } - - # Create formula - formula_str <- paste("did_transition", "~", paste(pred_cols, collapse = " + ")) - formula <- as.formula(formula_str) - - model <- glm(formula, data = data, family = quasibinomial()) - - # clean up the object - model[["model"]] <- NULL - model[["residuals"]] <- NULL - model[["fitted.values"]] <- NULL - model[["effects"]] <- NULL - model[["qr"]][["qr"]] <- NULL - model[["linear.predictors"]] <- NULL - model[["weights"]] <- NULL - model[["prior.weights"]] <- NULL - model[["y"]] <- NULL - attr(model[["formula"]], ".Environment") <- NULL - - # just to be sure - if (requireNamespace("butcher", quietly = TRUE)) { - model <- butcher::butcher(model) - } - - return(model) -} - - -#' Goodness of Fit Evaluation for GLM Models -#' -#' Evaluates the goodness of fit for a GLM model on test data, computing multiple -#' performance metrics including correlation, MSE, and AUC. -#' -#' @param model A fitted GLM model object (from fit_glm) -#' @param test_data A data.table containing test data with the same structure as -#' training data -#' @param ... Additional arguments (currently unused, for future extensibility) -#' -#' @return A named list containing: -#' - `cor`: Pearson correlation between predictions and actual values -#' - `mse`: Mean squared error -#' - `auc`: Area under the ROC curve (if pROC package is available) -#' - `n_test`: Number of test observations -#' -#' @details -#' The function uses the pROC package for AUC calculation if available. If pROC -#' is not installed, AUC will be NA. -#' -#' @export -gof_glm <- function(model, test_data, ...) { - predictions <- predict(model, newdata = test_data, type = "response") - actual <- test_data[["did_transition"]] - - # Correlation-based metric - cor_metric <- cor(predictions, actual, use = "complete.obs") - - # Mean squared error - mse <- mean((predictions - actual)^2, na.rm = TRUE) - - # AUC if pROC is available - auc <- NA_real_ - if (requireNamespace("pROC", quietly = TRUE)) { - roc_obj <- pROC::roc( - actual, - predictions, - quiet = TRUE, - direction = "<" - ) - auc <- as.numeric(pROC::auc(roc_obj)) - } - - list( - cor = cor_metric, - mse = mse, - auc = auc, - n_test = nrow(test_data) - ) -} diff --git a/R/trans_models_rf.R b/R/trans_models_rf.R deleted file mode 100644 index 2c03c43..0000000 --- a/R/trans_models_rf.R +++ /dev/null @@ -1,136 +0,0 @@ -#' Random Forest Model Fitting for Transition Models -#' -#' Fits a random forest model using the ranger package for transition modeling. -#' Uses observation-based weighting and stratified downsampling to handle class -#' imbalance. -#' -#' @param data A data.table containing the did_transition column and predictor columns -#' (prefixed with "id_pred_") -#' @param num.trees Number of trees to grow in the random forest (default: 100) -#' @param max.depth Maximum depth of each tree (default: 100) -#' @param ... Additional arguments passed to [ranger::ranger()] -#' -#' @return A fitted ranger model object, optionally butchered to reduce memory footprint -#' -#' @details -#' The function: -#' - Uses ranger for efficient random forest implementation -#' - Applies observation-based weights (same approach as grrf_filter) -#' - Uses stratified downsampling via sample.fraction -#' - Returns probability predictions for the positive class -#' - Computes variable importance using impurity measure -#' - Applies butcher::butcher() if available to reduce model size -#' -#' Default hyperparameters: -#' - num.trees = 500 -#' - min.node.size = 1 -#' -#' @export -fit_ranger <- function(data, num.trees = 100, max.depth = 100, ...) { - if (!requireNamespace("ranger", quietly = TRUE)) { - stop( - "Package 'ranger' is required but is not installed.\n", - "Please install it with: install.packages('ranger')", - call. = FALSE - ) - } - - pred_cols <- grep("^id_pred_", names(data), value = TRUE) - - if (length(pred_cols) == 0) { - stop("No predictor columns found (expected columns starting with 'id_pred_')") - } - - # Prepare data - x <- data[, ..pred_cols] - y <- as.factor(data[["did_transition"]]) - - # Compute observation-based weights (same approach as grrf_filter) - weights <- compute_balanced_weights(data[["did_transition"]]) - - # Get minority class size for downsampling - class_counts <- table(y) - nmin <- min(class_counts) - - # Fit ranger model - model <- ranger::ranger( - x = x, - y = y, - num.trees = num.trees, - case.weights = weights, - probability = TRUE, # For probability predictions - importance = "impurity", - max.depth = max.depth, - ... - ) - - # Butcher the model if package is available - if (requireNamespace("butcher", quietly = TRUE)) { - model <- butcher::butcher(model) - } - - return(model) -} - - -#' Goodness of Fit Evaluation for Random Forest Models -#' -#' Evaluates the goodness of fit for a ranger random forest model on test data, -#' computing multiple performance metrics including correlation, MSE, AUC, and -#' out-of-bag error. -#' -#' @param model A fitted ranger model object (from fit_ranger) -#' @param test_data A data.table containing test data with the same structure as -#' training data -#' @param ... Additional arguments (currently unused, for future extensibility) -#' -#' @return A named list containing: -#' - `cor`: Pearson correlation between predictions and actual values -#' - `mse`: Mean squared error -#' - `auc`: Area under the ROC curve (if pROC package is available) -#' - `oob_error`: Out-of-bag prediction error from the model -#' - `n_test`: Number of test observations -#' -#' @details -#' The function extracts probability predictions for the TRUE class from the ranger -#' model. It uses the pROC package for AUC calculation if available. If pROC is not -#' installed, AUC will be NA. -#' -#' @export -gof_ranger <- function(model, test_data) { - pred_cols <- grep("^id_pred_", names(test_data), value = TRUE) - x_test <- test_data[, ..pred_cols] - - # Get probability predictions for the TRUE class - predictions <- predict(model, data = x_test)$predictions[, "TRUE"] - actual <- as.numeric(test_data[["did_transition"]]) - - # Correlation-based metric - cor_metric <- cor(predictions, actual, use = "complete.obs") - - # Mean squared error - mse <- mean((predictions - actual)^2, na.rm = TRUE) - - # AUC if pROC is available - auc <- NA_real_ - if (requireNamespace("pROC", quietly = TRUE)) { - roc_obj <- pROC::roc( - actual, - predictions, - quiet = TRUE, - direction = "<" - ) - auc <- as.numeric(pROC::auc(roc_obj)) - } - - # Out-of-bag error from model - oob_error <- model$prediction.error - - list( - cor = cor_metric, - mse = mse, - auc = auc, - oob_error = oob_error, - n_test = nrow(test_data) - ) -} diff --git a/man/fit_glm.Rd b/man/fit_glm.Rd deleted file mode 100644 index 0cdf55e..0000000 --- a/man/fit_glm.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trans_models_glm.R -\name{fit_glm} -\alias{fit_glm} -\title{GLM Model Fitting for Transition Models} -\usage{ -fit_glm(data, ...) -} -\arguments{ -\item{data}{A data.table containing the result column and predictor columns -(prefixed with "id_pred_")} - -\item{...}{Additional arguments (currently ignored, for future extensibility)} -} -\value{ -A fitted GLM model object, optionally butchered to reduce memory footprint -} -\description{ -Fits a generalized linear model (GLM) with quasibinomial family for transition -modeling. The quasibinomial family is recommended over binomial as it better -handles overdispersion in the data. -} -\details{ -The function: -\itemize{ -\item Uses quasibinomial family to handle overdispersion -\item Automatically detects predictor columns (those starting with "id_pred_") -\item Applies butcher::butcher() if the package is available to reduce model size -} -} diff --git a/man/fit_ranger.Rd b/man/fit_ranger.Rd deleted file mode 100644 index 18bbd47..0000000 --- a/man/fit_ranger.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trans_models_rf.R -\name{fit_ranger} -\alias{fit_ranger} -\title{Random Forest Model Fitting for Transition Models} -\usage{ -fit_ranger(data, num.trees = 100, max.depth = 100, ...) -} -\arguments{ -\item{data}{A data.table containing the did_transition column and predictor columns -(prefixed with "id_pred_")} - -\item{num.trees}{Number of trees to grow in the random forest (default: 100)} - -\item{max.depth}{Maximum depth of each tree (default: 100)} - -\item{...}{Additional arguments passed to \code{\link[ranger:ranger]{ranger::ranger()}}} -} -\value{ -A fitted ranger model object, optionally butchered to reduce memory footprint -} -\description{ -Fits a random forest model using the ranger package for transition modeling. -Uses observation-based weighting and stratified downsampling to handle class -imbalance. -} -\details{ -The function: -\itemize{ -\item Uses ranger for efficient random forest implementation -\item Applies observation-based weights (same approach as grrf_filter) -\item Uses stratified downsampling via sample.fraction -\item Returns probability predictions for the positive class -\item Computes variable importance using impurity measure -\item Applies butcher::butcher() if available to reduce model size -} - -Default hyperparameters: -\itemize{ -\item num.trees = 500 -\item min.node.size = 1 -} -} diff --git a/man/gof_glm.Rd b/man/gof_glm.Rd deleted file mode 100644 index 60b3fca..0000000 --- a/man/gof_glm.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trans_models_glm.R -\name{gof_glm} -\alias{gof_glm} -\title{Goodness of Fit Evaluation for GLM Models} -\usage{ -gof_glm(model, test_data, ...) -} -\arguments{ -\item{model}{A fitted GLM model object (from fit_glm)} - -\item{test_data}{A data.table containing test data with the same structure as -training data} - -\item{...}{Additional arguments (currently unused, for future extensibility)} -} -\value{ -A named list containing: -\itemize{ -\item \code{cor}: Pearson correlation between predictions and actual values -\item \code{mse}: Mean squared error -\item \code{auc}: Area under the ROC curve (if pROC package is available) -\item \code{n_test}: Number of test observations -} -} -\description{ -Evaluates the goodness of fit for a GLM model on test data, computing multiple -performance metrics including correlation, MSE, and AUC. -} -\details{ -The function uses the pROC package for AUC calculation if available. If pROC -is not installed, AUC will be NA. -} diff --git a/man/gof_ranger.Rd b/man/gof_ranger.Rd deleted file mode 100644 index 9c1dfa3..0000000 --- a/man/gof_ranger.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/trans_models_rf.R -\name{gof_ranger} -\alias{gof_ranger} -\title{Goodness of Fit Evaluation for Random Forest Models} -\usage{ -gof_ranger(model, test_data) -} -\arguments{ -\item{model}{A fitted ranger model object (from fit_ranger)} - -\item{test_data}{A data.table containing test data with the same structure as -training data} - -\item{...}{Additional arguments (currently unused, for future extensibility)} -} -\value{ -A named list containing: -\itemize{ -\item \code{cor}: Pearson correlation between predictions and actual values -\item \code{mse}: Mean squared error -\item \code{auc}: Area under the ROC curve (if pROC package is available) -\item \code{oob_error}: Out-of-bag prediction error from the model -\item \code{n_test}: Number of test observations -} -} -\description{ -Evaluates the goodness of fit for a ranger random forest model on test data, -computing multiple performance metrics including correlation, MSE, AUC, and -out-of-bag error. -} -\details{ -The function extracts probability predictions for the TRUE class from the ranger -model. It uses the pROC package for AUC calculation if available. If pROC is not -installed, AUC will be NA. -} From e0615f9cc662aecc0a556fc862340de986b1e0e1 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 30 Apr 2026 08:36:21 +0200 Subject: [PATCH 14/15] get_pruned_trans_preds_t -> get_pred_filter_score --- DESCRIPTION | 1 + R/evoland_db.R | 17 ++-- R/trans_models_t.R | 9 +- R/trans_preds_t.R | 109 +++++++++++------------ inst/tinytest/test_integ_trans_preds_t.R | 32 ++++--- 5 files changed, 88 insertions(+), 80 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d016ffd..4e77335 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,6 +20,7 @@ Imports: duckdb (>= 1.5.2), glue, mlr3, + mlr3filters, qs2, R6, Rcpp, diff --git a/R/evoland_db.R b/R/evoland_db.R index 0693518..b6cd75f 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -222,18 +222,19 @@ evoland_db <- R6::R6Class( create_method_binding(set_full_trans_preds) }, - #' @description Remove predictors from the transition-predictor relation, aka - #' feature selection. See [get_pruned_trans_preds_t()]. - #' @param filter_fun Defaults to [covariance_filter()], see - #' [get_pruned_trans_preds_t()] for details. + #' @description Add filter scores to predictors for each `id_run, id_trans`. + #' See [get_pred_filter_score()]. + #' @param filter Character passed to [mlr3filters::flt] or + #' [mlr3filters::Filter] object specifying the filter method to use for + #' feature selection. #' @param cluster Optional cluster object for parallel processing - #' @param ... Additional arguments passed to `filter_fun`. - get_pruned_trans_preds_t = function( - filter_fun = covariance_filter, + #' @param ... Additional arguments passed to `flt`. + get_pred_filter_score = function( + filter = "correlation", cluster = NULL, ... ) { - create_method_binding(get_pruned_trans_preds_t) + create_method_binding(get_pred_filter_score) }, #' @description diff --git a/R/trans_models_t.R b/R/trans_models_t.R index 874d242..4a1ad1a 100644 --- a/R/trans_models_t.R +++ b/R/trans_models_t.R @@ -74,7 +74,7 @@ fit_partial_model_worker <- function( # if seed is set, we want ordering for reproducible sampling ordered = !is.null(seed) - ) + )[, -c("id_coord", "id_period_anterior")] if (nrow(trans_pred_data_full) == 0L) { stop(glue::glue( @@ -105,10 +105,9 @@ fit_partial_model_worker <- function( sample(idx_false, n_train_false) ) - # Subset to task columns (did_transition + predictors) - task_cols <- c("did_transition", pred_cols) - train_data <- trans_pred_data_full[train_idx, .SD, .SDcols = task_cols] - test_data <- trans_pred_data_full[-train_idx, .SD, .SDcols = task_cols] + # Split + train_data <- trans_pred_data_full[train_idx] + test_data <- trans_pred_data_full[-train_idx] # Coerce target; mlr3 uses factors internally also for twoclass classification train_data[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 7c6b4fe..421c886 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -108,89 +108,87 @@ set_full_trans_preds <- function(self, overwrite = FALSE) { ) } -# Worker function for parallel transition pruning -# Not exported; used internally by get_pruned_trans_preds_t -prune_trans_worker <- function(item, db, filter_fun, ordered_pred_data = FALSE, ...) { - # item is just a data.table slice. expecting scalar id_run and id_trans +# Worker function for parallel mlr3filter::Filter +# Not exported; used internally by get_pred_filter_score +pred_filter_worker <- function(item, db, filter, ordered_pred_data = FALSE) { + stopifnot(inherits(filter, "Filter"), inherits(item, "data.frame")) + # item has constant id_run and id_trans; extract first value into scalar id_run <- item[["id_run"]][1L] id_trans <- item[["id_trans"]][1L] id_pred <- item[["id_pred"]] + filter_id <- filter$id tryCatch( { # Get wide transition-predictor data - trans_pred_data <- db$trans_pred_data_v( + trans_pred_data_v <- db$trans_pred_data_v( id_trans = id_trans, id_pred = id_pred, ordered = ordered_pred_data - ) + )[, -c("id_coord", "id_period_anterior")] - # Check if we have any data - pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) - if (nrow(trans_pred_data) == 0L || length(pred_cols) == 0L) { + if (nrow(trans_pred_data_v) == 0L) { stop(glue::glue( "No data for transition {id_trans}; not pruning" )) } - # Return ranked + filtered predictor names as id_pred_{n} - filtered_preds <- filter_fun( - # drop vars that are irrelevant to the filtering - data = trans_pred_data[, .SD, .SDcols = !c("id_coord", "id_period_anterior")], - ... + # Coerce target; mlr3 uses factors internally also for twoclass classification + trans_pred_data_v[, did_transition := factor(did_transition, levels = c("FALSE", "TRUE"))] + + filter_task <- mlr3::as_task_classif( + trans_pred_data_v, + target = "did_transition", + positive = "TRUE" ) - if (length(filtered_preds) == 0L) { - stop(glue::glue( - "Filter dropped all predictors for {id_trans}; not pruning" - )) - } - # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) - selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) - - # Create result rows - return(data.table::data.table( - id_run = id_run, - id_pred = selected_ids, - id_trans = id_trans - )) + # this _will_ error if the filter is incompatible with the data. should we + # hard error? + filter$calculate(filter_task) + + scores_dt <- + data.table::as.data.table(filter) |> + setNames(c("id_pred", filter_id)) + + scores_dt[, id_pred := as.integer(sub("^id_pred_", "", id_pred))] + scores_dt[, id_run := id_run] + scores_dt[, id_trans := id_trans] + + return(scores_dt) }, error = function(e) { - # do not prune on error - warning(glue::glue( - "Error processing transition {id_trans}: {e$message}" - )) - return(data.table::data.table( - id_run = id_run, - id_pred = id_pred, - id_trans = id_trans - )) + warning(glue::glue("Error processing id_trans?={id_trans}: {e$message}")) + item[[filter_id]] <- NA_real_ + return(item) } ) } -# TODO: mlr3filters (https://mlr3filters.mlr-org.com/) provides a range of filter methods -# (mutual information, permutation importance, correlation-based, etc.) that could replace or -# supplement the current covariance_filter approach here. This refactoring should follow the -# same patterns as trans_models_t: accept an mlr3 Filter object (or id string + params) in -# place of filter_fun, store the filter id and parameters as DuckDB-native MAP columns, and -# serialize the filter object as a BLOB for full reproducibility. - -#' @describeIn trans_preds_t Get a pruned set of transition-predictor relationships -#' based on a filtering function -#' @param filter_fun A function that takes a transition-predictor data (cf. [trans_pred_data_v]) and -#' returns a character vector of column names to keep, see e.g. [covariance_filter] -#' @param na_value Value to use for missing data when retrieving predictor data +#' @describeIn trans_preds_t Get a filter score for all transition-predictor +#' relationships based on mlr3filters. Returns trans_preds_t with an additional +#' column named after the filter$id. The filter score can be used for feature +#' selection: simply subset according to the score and overwrite trans_preds_t +#' in the database using `db$trans_preds_t <- trans_preds_t[score > threshold]` +#' or similar. +#' @param filter An [mlr3filters::Filter] object or a character string +#' specifying the filter method, retrieved via [mlr3filters::flt]. Note that your +#' filter must be compatible with the feature data types; compare your +#' `pred_meta_t` table to for filter compatibility. #' @param cluster An optional cluster object, see [run_parallel_evoland] -#' @param ordered_pred_data Bool, should the predictor data be ordered? needed +#' @param ordered_pred_data Bool, should the predictor data be ordered? Needed #' for fully deterministic behavior -get_pruned_trans_preds_t <- function( +#' @param ... Additional arguments passed to `flt` if `filter` is a character string +get_pred_filter_score <- function( self, - filter_fun = covariance_filter, + filter, cluster = NULL, ordered_pred_data = FALSE, ... ) { + # Accept either a character vector of measure IDs or a list of Measure objects + if (is.character(filter)) { + filter <- mlr3filters::flt(filter, ...) + } if (self$row_count("trans_preds_t") == 0) { self$set_full_trans_preds() } @@ -200,12 +198,11 @@ get_pruned_trans_preds_t <- function( run_parallel_evoland( items = items, - worker_fun = prune_trans_worker, + worker_fun = pred_filter_worker, parent_db = self, cluster = cluster, - filter_fun = filter_fun, - ordered_pred_data = ordered_pred_data, - ... + filter = filter, + ordered_pred_data = ordered_pred_data ) |> data.table::rbindlist() |> as_trans_preds_t() diff --git a/inst/tinytest/test_integ_trans_preds_t.R b/inst/tinytest/test_integ_trans_preds_t.R index ce690e7..e0f0d95 100644 --- a/inst/tinytest/test_integ_trans_preds_t.R +++ b/inst/tinytest/test_integ_trans_preds_t.R @@ -15,26 +15,36 @@ db <- make_test_db(include_neighbors = FALSE, include_trans_preds = FALSE) # Test empty table expect_stdout(print(as_trans_preds_t()), "Transition-Predictor Relationships") +# suppress info logs from mlr3 during testing +lgr::get_logger("mlr3")$set_threshold("warn") + +set.seed(123) # Test covariance filter expect_message( - cov_results <- db$get_pruned_trans_preds_t( - filter_fun = covariance_filter, - corcut = 0.03, # absurdly low to force pruning for testing - ordered_pred_data = TRUE # for deterministic behavior in testing + cov_results <- db$get_pred_filter_score( + # the default performance measure for a classification task is classif.ce + # with minimize TRUE so we expect scores in [-1,0] + filter = mlr3filters::FilterPerformance$new(resampling = mlr3::rsmp("cv", folds = 2)), + ordered_pred_data = TRUE # for deterministic behavior ), "Processing 2 transitions" ) + cov_expected <- as_trans_preds_t(data.table::rowwiseDT( - id_run=, id_pred=, id_trans=, - 0, 1, 1, - 0, 2, 1, - 0, 2, 2, - 0, 3, 2, - 0, 4, 1 + id_run=, id_pred=, id_trans=, performance=, + 0, 1, 1, -0.4515679, + 0, 1, 2, -0.4639171, + 0, 2, 1, -0.4515679, + 0, 2, 2, -0.4639171, + 0, 3, 1, -0.4515679, + 0, 3, 2, -0.4639171, + 0, 4, 1, -0.4515679, + 0, 4, 2, -0.4639171 )) -expect_equal(cov_results, cov_expected) +expect_equal(cov_results, cov_expected, tol = 1e-7) +exit_file("grrf filter test skipped, not implemented as mlr3 filter yet") # test grrf filter with custom params expect_silent(db$set_full_trans_preds(overwrite = TRUE)) expect_message( From 936a75c2d88d8570d5e6de78f6dbbea87656f993 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 30 Apr 2026 12:31:16 +0200 Subject: [PATCH 15/15] drop covariance_filter, update docs --- DESCRIPTION | 1 - NAMESPACE | 1 - R/covariance_filter.R | 178 --------------------------------------- man/covariance_filter.Rd | 83 ------------------ man/evoland_db.Rd | 45 +++++----- man/trans_preds_t.Rd | 26 +++--- 6 files changed, 36 insertions(+), 298 deletions(-) delete mode 100644 R/covariance_filter.R delete mode 100644 man/covariance_filter.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 4e77335..7086472 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,6 @@ Collate: 'trans_models_t.R' 'alloc_dinamica.R' 'coords_t.R' - 'covariance_filter.R' 'parquet_db_utils.R' 'parquet_db.R' 'evoland_db.R' diff --git a/NAMESPACE b/NAMESPACE index 5135e9b..32444f4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,7 +57,6 @@ export(as_trans_preds_t) export(as_trans_rates_t) export(calc_fuzzy_similarity) export(calc_transition_similarity) -export(covariance_filter) export(create_change_map) export(create_coords_t_square) export(create_intrv_meta_t) diff --git a/R/covariance_filter.R b/R/covariance_filter.R deleted file mode 100644 index 4966bd6..0000000 --- a/R/covariance_filter.R +++ /dev/null @@ -1,178 +0,0 @@ -#' Two stage covariate filtering -#' -#' The `covariance_filter` returns a set of covariates for land use land cover change -#' (LULCC) models based on a two-stage variable selection: a first statistical fit -#' estimates a covariate's quality for a given prediction task. A second step selects -#' all variables below a given correlation threshold: We iterate over a correlation -#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. -#' candidates) greater than the given threshold are dropped from the full set of -#' candidates. This candidate selection is retained and used to select the next column, -#' until no further columns are left to investigate. The columns that were iterated over -#' are those returned as a character vector of selected variable names. -#' -#' @param data A data.table of target variable and candidate covariates to be filtered; -#' wide format with one predictor per column, except a binary "did_transition" column -#' (0: no trans, 1: trans) -#' @param rank_fun Optional function to compute ranking scores for each covariate. -#' Should take arguments (x, y, weights, ...) and return a single numeric value -#' (lower = better). Defaults to polynomial GLM p-value ranking. -#' @param weights Optional vector of weights to be used in the ranking function. Defaults to -#' class-balanced weights -#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates with correlation -#' coefficients above this threshold will be filtered out. Default is 0 (no filtering). -#' @param ... Additional arguments passed to rank_fun. -#' -#' @return A set of column names (covariates) to retain -#' -#' @details -#' The function first ranks covariates using the provided ranking function (default: -#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) -#' correlated variables based on the correlation cutoff threshold, preserving variables -#' in order of their ranking. See -#' for -#' where the concept came from. The original author was Antoine Adde, with edits by -#' Benjamin Black. A similar mechanism is found in . -#' -#' @name covariance_filter -#' -#' @export - -covariance_filter <- function( - data, - rank_fun = rank_poly_glm, - weights = compute_balanced_weights(data[["did_transition"]]), - corcut = 0.7, - ... -) { - # Early return for single covariate - if (ncol(data) == 1) { - return(data) - } - - # Validate binary outcome - stopifnot( - "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1 - ) - - for (col in names(data)) { - # TODO this whole polynomial preliminary ranking + covariance approach for - # correlation filtering feels a bit ad hoc (see @details) and is likely not - # the best choice, as it is not robust against different data types (e.g. - # factors). forcing everything to numeric for now. - data.table::set(data, j = col, value = as.numeric(data[[col]])) - } - - # Compute ranking scores for all covariates (vectorized where possible) - scores <- vapply( - data[, -"did_transition"], - rank_fun, - FUN.VALUE = numeric(1), - y = data[["did_transition"]], - weights = weights, - ... - ) - - # Sort by scores (lower = better/more significant) - ranked_order <- names(sort(scores)) - - # If no correlation filtering needed, return ranked predictors - if (corcut == 1) { - return(ranked_order) - } - - # Compute correlation matrix once - cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs")) - - # Iteratively select covariates based on correlation threshold - select_by_correlation(cor_mat, corcut) -} - - -#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns -#' the lower p value for each of the polynomial terms -#' @param x A numeric vector representing a single covariate -#' @param y A binary outcome vector (0/1) -#' @param weights Optional weights vector -#' @keywords internal -rank_poly_glm <- function(x, y, weights = NULL, ...) { - if (data.table::uniqueN(x) < 3) { - design_matrix <- as.numeric(x) # in case of logical / factor below 3 unique values - } else { - design_matrix <- cbind(1, poly(x, degree = 2, simple = TRUE)) - } - - fit <- glm.fit( - x = design_matrix, - y = y, - family = quasibinomial(), - weights = weights - ) - - # Get p-values for all terms (intercept + polynomial terms) - coef_summary <- summary.glm(fit)$coefficients - - # Return minimum p-value (most significant term) - min(coef_summary[, 4], na.rm = TRUE) -} - - -#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary -#' outcomes; returns a numeric vector -#' @param trans_result Binary outcome vector (0/1) -#' @param legacy Bool, use legacy weighting? -#' @keywords internal -compute_balanced_weights <- function(trans_result, legacy = FALSE) { - n_total <- length(trans_result) - n_trans <- sum(trans_result) - n_non_trans <- sum(!trans_result) - - # Compute inverse frequency weights - weights <- numeric(n_total) - - if (legacy) { - # I found this weighting in evoland-plus-legacy, but the models wouldn't converge - # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r - # This is actually just setting the underrepresented class to the rounded imbalance ratio - weights[!trans_result] <- 1 - weights[trans_result] <- round(n_non_trans / n_trans) - return(weights) - } - - # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) - # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint - # This weighting maintains the exact imbalance ratio - weights[trans_result] <- n_total / (2 * n_trans) - weights[!trans_result] <- n_total / (2 * n_non_trans) - - weights -} - - -#' @describeIn covariance_filter Implements the iterative selection procedure. -#' @param cor_mat Absolute correlation matrix -#' @param corcut Correlation cutoff threshold -#' @keywords internal -select_by_correlation <- function(cor_mat, corcut) { - var_names <- colnames(cor_mat) - - # Early return if all correlations are below threshold - if (all(cor_mat[lower.tri(cor_mat)] < corcut)) { - return(var_names) - } - - selected <- character(0) - remaining_idx <- seq_along(var_names) - - while (length(remaining_idx) > 0) { - # Select the first remaining variable (highest ranked) - current_var <- remaining_idx[1] - selected <- c(selected, var_names[current_var]) - - # Find variables with correlation <= corcut with current variable - # (excluding the variable itself) - keep_idx <- which(cor_mat[remaining_idx, current_var] <= corcut) - remaining_idx <- remaining_idx[keep_idx] - } - - selected -} diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd deleted file mode 100644 index ed2865c..0000000 --- a/man/covariance_filter.Rd +++ /dev/null @@ -1,83 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/covariance_filter.R -\name{covariance_filter} -\alias{covariance_filter} -\alias{rank_poly_glm} -\alias{compute_balanced_weights} -\alias{select_by_correlation} -\title{Two stage covariate filtering} -\usage{ -covariance_filter( - data, - rank_fun = rank_poly_glm, - weights = compute_balanced_weights(data[["did_transition"]]), - corcut = 0.7, - ... -) - -rank_poly_glm(x, y, weights = NULL, ...) - -compute_balanced_weights(trans_result, legacy = FALSE) - -select_by_correlation(cor_mat, corcut) -} -\arguments{ -\item{data}{A data.table of target variable and candidate covariates to be filtered; -wide format with one predictor per column, except a binary "did_transition" column -(0: no trans, 1: trans)} - -\item{rank_fun}{Optional function to compute ranking scores for each covariate. -Should take arguments (x, y, weights, ...) and return a single numeric value -(lower = better). Defaults to polynomial GLM p-value ranking.} - -\item{weights}{Optional weights vector} - -\item{corcut}{Correlation cutoff threshold} - -\item{...}{Additional arguments passed to rank_fun.} - -\item{x}{A numeric vector representing a single covariate} - -\item{y}{A binary outcome vector (0/1)} - -\item{trans_result}{Binary outcome vector (0/1)} - -\item{legacy}{Bool, use legacy weighting?} - -\item{cor_mat}{Absolute correlation matrix} -} -\value{ -A set of column names (covariates) to retain -} -\description{ -The \code{covariance_filter} returns a set of covariates for land use land cover change -(LULCC) models based on a two-stage variable selection: a first statistical fit -estimates a covariate's quality for a given prediction task. A second step selects -all variables below a given correlation threshold: We iterate over a correlation -matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. -candidates) greater than the given threshold are dropped from the full set of -candidates. This candidate selection is retained and used to select the next column, -until no further columns are left to investigate. The columns that were iterated over -are those returned as a character vector of selected variable names. -} -\details{ -The function first ranks covariates using the provided ranking function (default: -quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) -correlated variables based on the correlation cutoff threshold, preserving variables -in order of their ranking. See -\url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for -where the concept came from. The original author was Antoine Adde, with edits by -Benjamin Black. A similar mechanism is found in \url{https://github.com/antadde/covsel/}. -} -\section{Functions}{ -\itemize{ -\item \code{rank_poly_glm()}: Default ranking function using polynomial GLM. Returns -the lower p value for each of the polynomial terms - -\item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary -outcomes; returns a numeric vector - -\item \code{select_by_correlation()}: Implements the iterative selection procedure. - -}} -\keyword{internal} diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 3d72f4d..13cae14 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -64,7 +64,6 @@ Additional methods and active bindings are added to this class in separate files \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-evoland_db-trans_rates_dinamica_v}{\code{evoland_db$trans_rates_dinamica_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} \item \href{#method-evoland_db-get_read_expr}{\code{evoland_db$get_read_expr()}} \item \href{#method-evoland_db-print}{\code{evoland_db$print()}} @@ -82,9 +81,10 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-fit_partial_models}{\code{evoland_db$fit_partial_models()}} \item \href{#method-evoland_db-get_crossval_plots}{\code{evoland_db$get_crossval_plots()}} \item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} -\item \href{#method-evoland_db-get_pruned_trans_preds_t}{\code{evoland_db$get_pruned_trans_preds_t()}} +\item \href{#method-evoland_db-get_pred_filter_score}{\code{evoland_db$get_pred_filter_score()}} \item \href{#method-evoland_db-predict_trans_pot}{\code{evoland_db$predict_trans_pot()}} \item \href{#method-evoland_db-get_obs_trans_rates}{\code{evoland_db$get_obs_trans_rates()}} +\item \href{#method-evoland_db-trans_rates_dinamica_v}{\code{evoland_db$trans_rates_dinamica_v()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -105,15 +105,6 @@ Additional methods and active bindings are added to this class in separate files }} \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-trans_rates_dinamica_v}{}}} -\subsection{Method \code{trans_rates_dinamica_v()}}{ -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$trans_rates_dinamica_v(id_period)}\if{html}{\out{
}} -} - -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} \subsection{Method \code{new()}}{ @@ -500,28 +491,25 @@ Set an initial full set of transition / predictor relations, see \code{\link[=se } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-get_pruned_trans_preds_t}{}}} -\subsection{Method \code{get_pruned_trans_preds_t()}}{ -Remove predictors from the transition-predictor relation, aka -feature selection. See \code{\link[=get_pruned_trans_preds_t]{get_pruned_trans_preds_t()}}. +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-get_pred_filter_score}{}}} +\subsection{Method \code{get_pred_filter_score()}}{ +Add filter scores to predictors for each \verb{id_run, id_trans}. +See \code{\link[=get_pred_filter_score]{get_pred_filter_score()}}. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$get_pruned_trans_preds_t( - filter_fun = covariance_filter, - cluster = NULL, - ... -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$get_pred_filter_score(filter = "correlation", cluster = NULL, ...)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{filter_fun}}{Defaults to \code{\link[=covariance_filter]{covariance_filter()}}, see -\code{\link[=get_pruned_trans_preds_t]{get_pruned_trans_preds_t()}} for details.} +\item{\code{filter}}{Character passed to \link[mlr3filters:flt]{mlr3filters::flt} or +\link[mlr3filters:Filter]{mlr3filters::Filter} object specifying the filter method to use for +feature selection.} \item{\code{cluster}}{Optional cluster object for parallel processing} -\item{\code{...}}{Additional arguments passed to \code{filter_fun}.} +\item{\code{...}}{Additional arguments passed to \code{flt}.} } \if{html}{\out{
}} } @@ -556,6 +544,15 @@ Get the transition rates that were observed, see \link{trans_rates_t} \if{html}{\out{
}}\preformatted{evoland_db$get_obs_trans_rates()}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-trans_rates_dinamica_v}{}}} +\subsection{Method \code{trans_rates_dinamica_v()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$trans_rates_dinamica_v(id_period)}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index 14f6680..e5a8ca0 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -5,7 +5,7 @@ \alias{as_trans_preds_t} \alias{print.trans_preds_t} \alias{set_full_trans_preds} -\alias{get_pruned_trans_preds_t} +\alias{get_pred_filter_score} \title{Create Transition-Predictor Relationship Table} \usage{ as_trans_preds_t(x) @@ -14,9 +14,9 @@ as_trans_preds_t(x) set_full_trans_preds(self, overwrite = FALSE) -get_pruned_trans_preds_t( +get_pred_filter_score( self, - filter_fun = covariance_filter, + filter, cluster = NULL, ordered_pred_data = FALSE, ... @@ -25,21 +25,21 @@ get_pruned_trans_preds_t( \arguments{ \item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} -\item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} +\item{...}{Additional arguments passed to \code{flt} if \code{filter} is a character string} \item{overwrite}{Bool, should a potentially existing table be overwritten?} -\item{filter_fun}{A function that takes a transition-predictor data (cf. \link{trans_pred_data_v}) and -returns a character vector of column names to keep, see e.g. \link{covariance_filter}} +\item{filter}{An \link[mlr3filters:Filter]{mlr3filters::Filter} object or a character string +specifying the filter method, retrieved via \link[mlr3filters:flt]{mlr3filters::flt}. Note that your +filter must be compatible with the feature data types; compare your +\code{pred_meta_t} table to \url{https://mlr3filters.mlr-org.com} for filter compatibility.} \item{cluster}{An optional cluster object, see \link{run_parallel_evoland}} -\item{ordered_pred_data}{Bool, should the predictor data be ordered? needed +\item{ordered_pred_data}{Bool, should the predictor data be ordered? Needed for fully deterministic behavior} \item{db}{An \link{evoland_db} instance with populated trans_meta_t and pred_meta_t tables} - -\item{na_value}{Value to use for missing data when retrieving predictor data} } \value{ A data.table of class "trans_preds_t" with columns: @@ -63,7 +63,11 @@ modelling each transition type. \itemize{ \item \code{set_full_trans_preds()}: Set an initial full set of transition / predictor relations -\item \code{get_pruned_trans_preds_t()}: Get a pruned set of transition-predictor relationships -based on a filtering function +\item \code{get_pred_filter_score()}: Get a filter score for all transition-predictor +relationships based on mlr3filters. Returns trans_preds_t with an additional +column named after the filter$id. The filter score can be used for feature +selection: simply subset according to the score and overwrite trans_preds_t +in the database using \code{db$trans_preds_t <- trans_preds_t[score > threshold]} +or similar. }}