diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index 27d45283..f9cb681d 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -4,9 +4,10 @@
 on:
   push:
     branches: [main, master]
   pull_request:
-    branches: [main, master]

-name: test-coverage
+name: test-coverage.yaml
+
+permissions: read-all

 jobs:
   test-coverage:
@@ -15,7 +16,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

       - uses: r-lib/actions/setup-r@v2
         with:
@@ -23,28 +24,39 @@ jobs:

       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::covr
+          extra-packages: any::covr, any::xml2
           needs: coverage

       - name: Test coverage
         run: |
-          covr::codecov(
+          cov <- covr::package_coverage(
            quiet = FALSE,
            clean = FALSE,
            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
          )
+          print(cov)
+          covr::to_cobertura(cov)
        shell: Rscript {0}

+      - uses: codecov/codecov-action@v5
+        with:
+          # Fail on error if not on a PR, or if on a PR and a token is given
+          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
+          files: ./cobertura.xml
+          plugins: noop
+          disable_search: true
+          token: ${{ secrets.CODECOV_TOKEN }}
+
       - name: Show testthat output
         if: always()
         run: |
          ## --------------------------------------------------------------------
-          find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
+          find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
        shell: bash

       - name: Upload test results
         if: failure()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-test-failures
-          path: ${{ runner.temp }}/package
+          path: ${{ runner.temp }}/package
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index cfc21d6d..eedc5038 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,13 +1,17 @@
 Package: VIM
-Version: 6.2.4
+Version: 7.0.0
 Title: Visualization and Imputation of Missing Values
 Authors@R: c(
   person("Matthias", "Templ", email = "matthias.templ@gmail.com", role = c("aut","cre")),
   person("Alexander", "Kowarik", email = "alexander.kowarik@statistik.gv.at", role = c("aut"), comment=c(ORCID="0000-0001-8598-4130")),
   person("Andreas", "Alfons", role = c("aut")),
+  person("Johannes", "Gussenbauer", role = c("aut")),
+  person("Nina", "Niederhametner", role = c("aut")),
+  person("Eileen", "Vattheuer", role = c("aut")),
   person("Gregor", "de Cillia", email = "gregor.decillia@statistik.gv.at", role = c("aut")),
   person("Bernd", "Prantner", role = c("ctb")),
-  person("Wolfgang", "Rannetbauer", role = c("aut")))
+  person("Wolfgang", "Rannetbauer", role = c("aut"))
+  )
 Depends: R (>= 4.1.0),colorspace,grid
 Imports:
@@ -27,7 +31,14 @@ Imports:
     ranger,
     MASS,
     xgboost,
-    data.table(>= 1.9.4)
+    data.table(>= 1.9.4),
+    mlr3,
+    mlr3pipelines,
+    R6,
+    paradox,
+    mlr3tuning,
+    mlr3learners,
+    future
 Suggests:
     dplyr,
     tinytest,
@@ -35,7 +46,8 @@ Suggests:
     rmarkdown,
     reactable,
     covr,
-    withr
+    withr,
+    glmnet
 Description: New tools for the visualization of missing and/or imputed values
     are introduced, which can be used for exploring the data and the structure
     of the missing and/or imputed values.
    Depending on this structure of the missing
@@ -51,7 +63,7 @@ License: GPL (>= 2)
 URL: https://github.com/statistikat/VIM
 Repository: CRAN
 LinkingTo: Rcpp
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index c3182e8d..d737a16f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -50,6 +50,7 @@ export(scattMiss)
 export(scattmatrixMiss)
 export(spineMiss)
 export(tableMiss)
+export(vimpute)
 export(xgboostImpute)
 import(Rcpp)
 import(colorspace)
@@ -92,11 +93,36 @@ importFrom(graphics,text)
 importFrom(graphics,title)
 importFrom(laeken,weightedMean)
 importFrom(laeken,weightedMedian)
+importFrom(mlr3,LearnerClassif)
+importFrom(mlr3,LearnerRegr)
+importFrom(mlr3,PredictionClassif)
+importFrom(mlr3,PredictionRegr)
+importFrom(mlr3,TaskClassif)
+importFrom(mlr3,TaskRegr)
+importFrom(mlr3,as_task_classif)
+importFrom(mlr3,as_task_regr)
+importFrom(mlr3,lrn)
+importFrom(mlr3,msr)
+importFrom(mlr3,resample)
+importFrom(mlr3,rsmp)
+importFrom(mlr3learners,LearnerRegrCVGlmnet)
+importFrom(mlr3pipelines,"%>>%")
+importFrom(mlr3pipelines,GraphLearner)
+importFrom(mlr3pipelines,PipeOpModelMatrix)
+importFrom(mlr3pipelines,po)
+importFrom(mlr3tuning,TuningInstanceBatchSingleCrit)
+importFrom(mlr3tuning,tnr)
+importFrom(mlr3tuning,trm)
+importFrom(paradox,p_dbl)
+importFrom(paradox,p_fct)
+importFrom(paradox,p_int)
+importFrom(paradox,ps)
 importFrom(ranger,importance)
 importFrom(ranger,ranger)
 importFrom(utils,capture.output)
 importFrom(utils,flush.console)
 importFrom(utils,head)
+importFrom(utils,modifyList)
 importFrom(vcd,labeling_border)
 importFrom(vcd,mosaic)
 useDynLib(VIM)
diff --git a/NEWS.md b/NEWS.md
index 82a1ce8c..9c30fa90 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,6 @@
+# VIM 7.0.0
+  - new function `vimpute()`, which uses the mlr3 backend for flexible imputation.
+
 # VIM 6.x.x
   - fix infinite loop in matchImpute in case all observations of a variable are missing
   - remove parameter metric from kNN because it was not used
diff --git a/R/VIM-package.R b/R/VIM-package.R
index 84c5a441..3ba53030 100644
--- a/R/VIM-package.R
+++ b/R/VIM-package.R
@@ -17,12 +17,17 @@
 #' @importFrom laeken weightedMean
 #' @importFrom graphics Axis abline axTicks axis barplot box hist boxplot layout lcm lines locator par plot.new plot.window points
 #' @importFrom graphics polygon rect strheight strwidth text title
-#' @importFrom utils capture.output flush.console head
+#' @importFrom utils capture.output flush.console head modifyList
 #' @importFrom ranger ranger importance
 #' @importFrom MASS stepAIC lqs polr rlm
+#' @importFrom mlr3 LearnerRegr PredictionRegr LearnerClassif PredictionClassif lrn TaskRegr TaskClassif as_task_regr as_task_classif rsmp msr resample
+#' @importFrom mlr3pipelines PipeOpModelMatrix %>>% GraphLearner po
+#' @importFrom paradox ps p_fct p_dbl p_int
+#' @importFrom mlr3tuning tnr trm TuningInstanceBatchSingleCrit
+#' @importFrom mlr3learners LearnerRegrCVGlmnet
 #' @useDynLib VIM
 NULL
-
+utils::globalVariables(c("self", "super"))
 #' Animals_na
 #'
 #' @description Average log brain and log body weights for 28 Species
@@ -596,12 +601,7 @@ NULL
 #' handling of the plot methods. In addition, `VIM` can be used for data
 #' from essentially any field.
 #'
-#' @name VIM-package
-#' @aliases VIM-package VIM
-#' @docType package
-#' @author Matthias Templ, Andreas Alfons, Alexander Kowarik, Bernd Prantner
-#'
-#' Maintainer: Matthias Templ
+
 #' @references M. Templ, A. Alfons, P. Filzmoser (2012) Exploring incomplete
 #' data using visualization tools. *Journal of Advances in Data Analysis
 #' and Classification*, Online first. DOI: 10.1007/s11634-011-0102-y.
 #'
 #' M. Templ, A. Kowarik, P. Filzmoser (2011) Iterative stepwise regression
 #' imputation using standard and robust methods. *Journal of
 #' Computational Statistics and Data Analysis*, Vol. 55, pp. 2793-2806.
-#' @keywords package
-NULL
+#' @keywords internal
+"_PACKAGE"
diff --git a/R/helper_vimpute.R b/R/helper_vimpute.R
new file mode 100644
index 00000000..baf6238f
--- /dev/null
+++ b/R/helper_vimpute.R
@@ -0,0 +1,515 @@
+register_robust_learners <- function() {
+
+  # Robust Regression Learner
+  LearnerRegrRobustLM = R6::R6Class(
+    classname = "LearnerRegrRobustLM",
+    inherit = LearnerRegr,
+    public = list(
+      initialize = function() {
+        param_set = ps(
+          method = p_fct(c("M", "MM"), default = "MM"),
+          psi = p_fct(c("bisquare", "lqq", "optimal"), default = "bisquare"),
+          tuning.chi = p_dbl(lower = 0, upper = Inf, default = 1.55),
+          tuning.psi = p_dbl(lower = 0, upper = Inf, default = 4.69),
+          setting = p_fct(c("KS2014", "KS2011"), default = "KS2014"),
+          max.it = p_int(lower = 1, upper = Inf, default = 50),
+          k.max = p_int(lower = 1, upper = Inf, default = 200),
+          nResample = p_int(lower = 1, upper = Inf, default = 500),
+          subsampling = p_fct(c("simple", "nonsingular"), default = "nonsingular"),
+          ridge_lambda = p_dbl(lower = 0, upper = 1, default = 1e-4),
+          refine.tol = p_dbl(lower = 0, upper = Inf, default = 1e-7),
+          solve.tol = p_dbl(lower = 0, upper = Inf, default = 1e-7),
+          trace.lev = p_int(lower = 0, upper = Inf, default = 0)
+        )
+
+        super$initialize(
+          id = "regr.lm_rob",
+          feature_types = c("numeric", "integer", "factor", "ordered"),
+          predict_types = c("response"),
+          packages = c("robustbase", "stats"),
+          man = "robustbase::lmrob",
+          param_set = param_set
+        )
+
+        self$param_set$values = list(
+          method = "MM",
+          psi = "bisquare",
+          tuning.chi = 1.55,
+          tuning.psi = 4.69,
+          setting = "KS2014",
+          max.it = 50,
+          k.max = 200,
+          nResample = 500,
+          subsampling = "nonsingular",
+          ridge_lambda = 1e-4,
+          refine.tol = 1e-7,
+          solve.tol = 1e-7,
+          trace.lev = 0
+        )
+      }
+    ),
+
+    private = list(
+      .train = function(task) {
+        pv = self$param_set$get_values()
+        data = as.data.frame(task$data())
+        target = task$target_names
+        features = task$feature_names
+
+        factor_cols = sapply(data, is.factor)
+        if (any(factor_cols)) {
+          for (col in names(data)[factor_cols]) {
+            data[[col]] = droplevels(data[[col]])
+          }
+        }
+
+        formula = reformulate(features, response = target)
+        control = do.call(robustbase::lmrob.control, pv)
+
+        model = tryCatch(
+          robustbase::lmrob(formula, data = data, control = control),
+          error = function(e) {
+            warning(sprintf("lmrob() failed for '%s': %s\nFalling back to lm()", target, e$message))
+            NULL
+          }
+        )
+
+        if (is.null(model)) {
+          # Fallback: lm
+          model = lm(formula, data = data)
+          class(model) = c("lm_fallback", class(model))
+          self$state$used_fallback = TRUE
+        } else {
+          self$state$used_fallback = FALSE
+        }
+
+        factor_col_names = names(data)[factor_cols] # names of the factor columns
+        self$state$factor_levels = lapply(data[, factor_col_names, drop = FALSE], levels)
+        return(model)
+      },
+
+      .predict = function(task) {
+        model = self$model
+        newdata = as.data.frame(task$data())
+
+        if (!is.null(self$state$factor_levels)) {
+          for (var in names(self$state$factor_levels)) {
+            if (var %in% colnames(newdata) && is.factor(newdata[[var]])) {
+              new_levels = setdiff(levels(newdata[[var]]), self$state$factor_levels[[var]])
+              if (length(new_levels) > 0) {
+                warning(sprintf("New levels (%s) in factor '%s' replaced with NA",
+                                paste(new_levels, collapse = ", "), var))
+              }
+              newdata[[var]] = factor(newdata[[var]], levels = self$state$factor_levels[[var]])
+            }
+          }
+        }
+
+        response = tryCatch({
+          predict(model, newdata = newdata)
+        }, error = function(e) {
+          warning("Prediction failed: ", e$message)
+          rep(NA_real_, nrow(newdata))
+        })
+
+        PredictionRegr$new(task = task, response = response)
+      }
+    )
+  )
+
+  mlr3::mlr_learners$add("regr.lm_rob", LearnerRegrRobustLM)
+
+
+  # Robust Classification Learner
+  LearnerClassifGlmRob <- R6::R6Class(
+    inherit = mlr3::LearnerClassif,
+    public = list(
+      initialize = function() {
+        super$initialize(
+          id = "classif.glm_rob",
+          feature_types = c("numeric", "integer", "factor", "ordered"),
+          predict_types = c("response", "prob"),
+          packages = c("mlr3learners"),
+          properties = c("twoclass", "multiclass")
+        )
+        self$state$learner = NULL
+      }
+    ),
+
+    private = list(
+      .train = function(task) {
+        n_classes = length(task$class_names)
+        if (n_classes == 2) {
+          self$state$learner = mlr3::lrn("classif.log_reg", predict_type = self$predict_type)
+        } else {
+          self$state$learner = mlr3::lrn("classif.multinom", predict_type = self$predict_type)
+        }
+        self$state$learner$train(task)
+      },
+
+      .predict = function(task) {
+        if (is.null(self$state$learner)) stop("Model not trained yet")
+        pred = self$state$learner$predict(task)
+        return(pred)
+      }
+    )
+  )
+
+  mlr3::mlr_learners$add("classif.glm_rob", LearnerClassifGlmRob)
+
+}
+
+#
+# task = mlr3::tsk("iris")  # multiclass classification example
+# learner = mlr3::lrn("classif.glm_rob", predict_type = "prob")
+# learner$train(task)
+# pred = learner$predict(task)
+# print(pred)
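+#
+# A parallel, minimal sketch for the robust regression learner registered above
+# (not part of the original demo; mtcars serves as an assumed example dataset):
+# register_robust_learners()
+# task = mlr3::as_task_regr(mtcars, target = "mpg")
+# learner = mlr3::lrn("regr.lm_rob")
+# learner$train(task)  # falls back to lm() if robustbase::lmrob() fails
+# print(learner$predict(task))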
+
+### +++++++++++++++++++++++++++++++++ Helper Functions +++++++++++++++++++++++++++++++++ ###
+#
+#
+#
+# left-hand side of formula
+# extracts the left side of a formula and returns the name of the applied transformation (e.g. log)
+identify_lhs_transformation <- function(formula) {
+  lhs <- as.character(formula)[2] # Extract the left side of the formula
+
+  # Permitted transformations
+  transformations <- c("log", "sqrt", "exp", "I\\(1/", "boxcox")
+
+  for (t in transformations) {
+    if (grepl(paste0("^", t), lhs)) {
+      return(ifelse(t == "I\\(1/", "inverse", gsub("\\\\", "", t))) # "I(1/" is reported as "inverse"
+    }
+  }
+
+  return(NULL) # No transformation
+}
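+# For illustration (assumed example formulas, following the mapping above):
+# identify_lhs_transformation(log(y) ~ x1 + x2)  # "log"
+# identify_lhs_transformation(I(1/y) ~ x1)       # "inverse"
+# identify_lhs_transformation(y ~ x1)            # NULL (no transformation)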
+#
+#
+#
+# Transformation
+# Identifies which variables need to be transformed in a formula, extracts transformations and
+# operators, checks for existing variables and missing values, and returns the results in a list.
+identify_variables <- function(formula, data, target_col) {
+  data <- as.data.frame(data)
+
+  if (is.list(formula)) {
+    results <- lapply(formula, function(f) identify_variables(f, data, target_col))
+    return(results)
+  }
+
+  # Extract formula as character string
+  formchar <- as.character(formula)
+  lhs <- gsub("^I\\(1/|log\\(|sqrt\\(|boxcox\\(|exp\\(|\\)$| ", "", formchar[2]) # Remove transformations and spaces from the left-hand side
+  rhs <- ifelse(length(formchar) > 2, gsub(" ", "", formchar[3]), "")
+
+  # Decompose the right-hand side according to all relevant operators
+  rhs_vars <- if (rhs != "") unlist(strsplit(rhs, "[-+*:/%()]")) else character(0)
+  rhs_vars <- rhs_vars[rhs_vars != ""]
+
+  # Extract the original variable names without transformations
+  raw_lhs <- gsub("(log\\(|sqrt\\(|I\\(1/|boxcox\\(|exp\\(|\\))", "", lhs)
+  raw_rhs_vars <- gsub("(log\\(|sqrt\\(|I\\(1/|boxcox\\(|exp\\(|\\))", "", rhs_vars)
+
+  # Remove duplicate variable names
+  raw_rhs_vars <- unique(raw_rhs_vars)
+
+  # Identify transformations in the predictors and the response variable
+  transformations <- c(
+    ifelse(grepl("log\\(", formchar[2]), "log", "none"),
+    sapply(rhs_vars, function(var) {
+      if (grepl("log\\(", var)) return("log")
+      if (grepl("sqrt\\(", var)) return("sqrt")
+      if (grepl("^I\\(1/", var)) return("inverse")
+      if (grepl("boxcox\\(", var)) return("boxcox")
+      if (grepl("exp\\(", var)) return("exponential")
+      return("none")
+    }, USE.NAMES = FALSE)
+  )
+
+  # Identify model matrix operators
+  operator_mapping <- list(
+    ":" = "interaction",
+    "*" = "crossing",
+    "^" = "power",
+    "%in%" = "nested",
+    "/" = "sub-nested",
+    "-" = "exclusion"
+  )
+
+  operators <- unique(unlist(regmatches(rhs, gregexpr("[:*^%in%/-]", rhs))))
+  operator_types <- setNames(operators, sapply(operators, function(op) operator_mapping[[op]]))
+
+  # Initialize empty lists if no predictors are available
+  if (length(rhs_vars) == 0) {
+    transformations <- character(0)
+    raw_rhs_vars <- character(0)
+  }
+
+  # Identify the type of variable (select only existing columns)
+  existing_vars <- c(raw_lhs, raw_rhs_vars)
+  existing_vars <- setdiff(existing_vars, target_col)
+  existing_vars <- existing_vars[existing_vars %in% colnames(data)]
+  variable_types <- sapply(data[, existing_vars, drop = FALSE], class)
+
+  # Check missing values in the predictors
+  missing_values <- sapply(data[, intersect(raw_rhs_vars, colnames(data)), drop = FALSE], function(col) sum(is.na(col)))
+
+  # Compile results
+  result <- list(
+    response_variable = raw_lhs,
+    predictor_variables = existing_vars,
+    transformations = setNames(transformations, c(lhs, rhs_vars)),
+    variable_types = variable_types,
+    missing_values = missing_values,
+    model_matrix_operators = operator_types
+  )
+
+  return(result)
+}
+#
+#
+#
+# Find formula
+# Selects from a list of formulas the one whose left-hand side matches the cleaned response
+# variable after removing transformations and spaces.
+select_formula <- function(formula_list, response_variable) {
+  # Remove transformations and spaces from response variable
+  response_variable_cleaned <- gsub("^I\\(1/|^log\\(|^sqrt\\(|^boxcox\\(|^exp\\(|\\)$| ", "", response_variable)
+
+  selected_formula <- Filter(function(f) {
+    # Remove transformations and spaces from the left-hand side of the formula
+    formula_lhs <- gsub("^I\\(1/|^log\\(|^sqrt\\(|^boxcox\\(|^exp\\(|\\)$| ", "", deparse(f[[2]]))
+    identical(formula_lhs, response_variable_cleaned)
+  }, formula_list)
+
+  if (length(selected_formula) == 0) return(FALSE)
+  return(selected_formula[[1]])
+}
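+# For illustration (assumed example formulas):
+# forms <- list(log(Sleep) ~ BodyWgt + Span, Dream ~ Span)
+# select_formula(forms, "Sleep")  # returns log(Sleep) ~ BodyWgt + Span
+# select_formula(forms, "Gest")   # returns FALSE (no matching left-hand side)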
+#
+#
+#
+# Precheck
+# Performs a series of preliminary checks, including checking for missing values, data types,
+# formulas, PMM settings and supported methods, and returns the checked data and variable information.
+precheck <- function(
+    data,
+    pmm,
+    formula,
+    method,
+    sequential
+) {
+
+  # check missing data
+  variables = colnames(data)
+  variables_NA = colnames(data)[apply(data, 2, function(x) any(is.na(x)))] # all variables that have missing data
+  if (length(variables_NA) == 0) {
+    stop("Error: No missing data available")
+  } else {
+    message("Variables with Missing Data: ", paste(variables_NA, collapse = ","))
+  }
+
+  # check data structure
+  if (is.data.table(data)) {
+    message("data is data.table")
+  } else if (is.matrix(data) || is.data.frame(data)) {
+    data <- as.data.table(data)
+  } else {
+    stop("Error: Input must be a dataframe or a data.table.")
+  }
+
+  # check formula
+  if (!identical(formula, FALSE)) {
+    if (!is.list(formula)) {
+      stop("Error: 'formula' must be either FALSE or a list of formulas.")
+    }
+
+    for (var in names(formula)) {
+      if (!inherits(formula[[var]], "formula")) {
+        stop(paste("Error: Element for ", var, " is not a valid formula!"))
+      }
+
+      # Extract variables from the formula
+      form_vars <- all.vars(formula[[var]])
+
+      # Check whether all variables are present in the data
+      missing_vars <- setdiff(form_vars, names(data))
+      if (length(missing_vars) > 0) {
+        stop(paste("Missing variables in data:", paste(missing_vars, collapse = ", ")))
+      }
+    }
+  }
+
+  # check pmm
+  check_pmm <- function(
+    pmm,
+    variables
+  ) {
+    if (!(is.logical(pmm) && length(pmm) == 1) &&
+        !(is.list(pmm) && (length(pmm) == length(variables) || length(pmm) == length(variables_NA)) && all(sapply(pmm, is.logical)))) {
+      stop("Error: pmm must contain a list of the length of 'variables' or 'considered_variables' (if specified) with TRUE/FALSE values.")
+    }
+  }
+  check_pmm(pmm, variables)
+
+  # check methods
+  supported_methods <- c("ranger", "regularized", "xgboost", "robust")
+
+  # validate methods
+  if (length(method) == 0) {
+    message("Methods are empty, no imputation is used.")
+  } else if ((is.character(method) && length(method) == 1) ||
+             (is.list(method) && length(method) == 1 && is.character(method[[1]]))) {
+    # If `method` is a single string or a single-element list, set for all variables
+    method_value <- if (is.list(method)) method[[1]] else method
+    if (!(method_value %in% supported_methods)) {
+      stop("Error: The provided method is not supported.")
+    }
+    method <- setNames(as.list(rep(method_value, length(variables))), variables)
+  } else if (length(method) == length(variables) || length(method) == length(variables_NA)) {
+    # If `method` has the same length as `variables` or the number of variables matches NAs
+    if (is.list(method) && all(sapply(method, is.character))) {
+      # Check whether all elements in `method` are strings and contain valid methods
+      if (!all(unlist(method) %in% supported_methods)) {
+        stop("Error: One or more methods in the list are not supported.")
+      }
+    } else {
+      stop("Error: 'method' must be a list of string values.")
+    }
+  } else {
+    stop("Error: 'method' must either be empty, a single string, a single-element list, have the same length as 'variables' or 'considered_variables' (if specified), or the number of variables must match NAs.")
+  }
+
+  # warning if more than 50% missing values
+  if (nrow(data) == 0) stop("Error: Data has no rows.")
+  missing_counts <- colSums(is.na(data))
+  na_cols <- names(missing_counts[missing_counts > 0.5 * nrow(data)])
+  if (length(na_cols) > 0) {
+    warning(paste("Warning: The following variables have more than 50% missing values:", paste(na_cols, collapse = ", ")))
+  }
+
+  # Datatypes
+  data[, (variables) := lapply(.SD, function(x) {
+    if (is.numeric(x)) {
+      as.numeric(x) # integer & double to numeric
+    } else if (is.character(x)) {
+      as.factor(x) # strings to factors
+    } else if (is.logical(x)) {
+      as.numeric(x) # TRUE/FALSE -> 1/0
+    } else if (is.factor(x)) {
+      x
+    } else {
+      stop("Error: Unknown datatype")
+    }
+  }), .SDcols = variables]
+
+  message("Precheck done.")
+  return(list(data = data, variables = variables, variables_NA = variables_NA, method = method))
+}
+
+#
+#
+#
+# Semicontinuous variables
+is_semicontinuous <- function(x) {
+  is.numeric(x) &&
+    any(x == 0, na.rm = TRUE) &&
+    any(x > 0, na.rm = TRUE) &&
+    sum(x == 0, na.rm = TRUE) / sum(!is.na(x)) > 0.1 &&
+    !all(na.omit(x) %in% c(0, 1)) # check that it is not binary
+}
+
+#
+#
+#
+# factor levels
+enforce_factor_levels <- function(df, original_levels) {
+  for (colname in names(original_levels)) {
+    levels <- original_levels[[colname]]
+    if (is.null(levels)) next # skip numeric variables
+
+    if (colname %in% names(df)) {
+      if (is.factor(df[[colname]]) || is.character(df[[colname]])) {
+        vals <- df[[colname]]
+        unknown_levels <- setdiff(unique(vals[!is.na(vals)]), levels)
+
+        if (length(unknown_levels) > 0) {
+          warning(sprintf("Column '%s' has unknown level: %s",
+                          colname, paste(unknown_levels, collapse = ", ")))
+        }
+
+        df[[colname]] <- factor(vals, levels = levels)
+      }
+    }
+  }
+  return(df)
+}
+
+#
+#
+#
+# check levels
+check_all_factor_levels <- function(df, factor_levels) {
+  for (var in names(factor_levels)) {
+    if (var %in% names(df) && is.factor(df[[var]])) {
+      missing_levels <- setdiff(levels(df[[var]]), factor_levels[[var]])
+      new_levels <- setdiff(factor_levels[[var]], levels(df[[var]]))
+      if (length(missing_levels) > 0 || length(new_levels) > 0) {
+        stop(sprintf(
+          "Level mismatch in variable '%s':\n  Levels in data: %s\n  Expected levels: %s",
+          var,
+          paste(levels(df[[var]]), collapse = ", "),
+          paste(factor_levels[[var]], collapse = ", ")
+        ))
+      }
+    }
+  }
+}
+
+#
+#
+#
+# new levels -> NA
+set_new_levels_to_na <- function(df, factor_levels, data_y_fill_final, skip_methods = c("xgboost"), method_var = NULL) {
+  if (!is.null(method_var) && !(method_var %in% skip_methods)) {
+    for (col in names(factor_levels)) {
+      if (col %in% names(df) && is.factor(df[[col]]) && col %in% names(data_y_fill_final)) {
+        valid_levels <- levels(data_y_fill_final[[col]])
+        df[[col]][!df[[col]] %in% valid_levels & !is.na(df[[col]])] <- NA
+      }
+    }
+  }
+  return(df)
+}
+
+#
+#
+#
+#
+check_factor_levels <- function(data, original_levels) {
+  for (colname in names(original_levels)) {
+    if (colname %in% names(data)) {
+      if (is.factor(data[[colname]])) {
+        data_levels <- levels(data[[colname]])
+        ref_levels <- original_levels[[colname]]
+
+        # 1. Check for missing levels (levels in the original that are not in the current factor)
+        missing_levels <- setdiff(ref_levels, data_levels)
+        if (length(missing_levels) > 0) {
+          warning(sprintf("Column '%s': Missing levels in factor: %s", colname, paste(missing_levels, collapse = ", ")))
+        }
+
+        # 2. Check for new levels (levels in the data factor that are not in the original)
+        new_levels <- setdiff(data_levels, ref_levels)
+        if (length(new_levels) > 0) {
+          warning(sprintf("Column '%s': New levels in factor: %s", colname, paste(new_levels, collapse = ", ")))
+        }
+
+        # 3. Check whether values in the data factor are outside the defined levels (can happen if factors are not set correctly)
+        invalid_values <- setdiff(unique(as.character(data[[colname]])), ref_levels)
+        if (length(invalid_values) > 0) {
+          warning(sprintf("Column '%s': Invalid value: %s", colname, paste(invalid_values, collapse = ", ")))
+        }
+      }
+    }
+  }
+}
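+
+# For illustration (assumed example vectors, per the definition of is_semicontinuous above):
+# is_semicontinuous(c(0, 0, 1.2, 3.4, 0, 5.6))  # TRUE: >10% zeros plus positive values
+# is_semicontinuous(c(0, 1, 0, 1, 1))           # FALSE: binary 0/1 is excluded
+# is_semicontinuous(c(1.5, 2.3, 4.1))           # FALSE: no zeros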
diff --git a/R/vimpute.R b/R/vimpute.R
new file mode 100644
index 00000000..72f4ee7b
--- /dev/null
+++ b/R/vimpute.R
@@ -0,0 +1,1482 @@
+
+#' Impute missing values with a preferred model, sequentially, with hyperparameter tuning and with PMM (if desired)
+#' Requires the 'helper_vimpute' script
+
+## PARAMETERS ##
+#' @param data - Dataset with missing values. Can be provided as a data.table or data.frame.
+#' @param considered_variables - A character vector of variable names to be either imputed or used as predictors, excluding irrelevant columns from the imputation process.
+#' @param method - A named list specifying the imputation method for each variable:
+#    - ranger
+#    - xgboost
+#    - regularized
+#    - robust
+#' @param pmm - TRUE/FALSE indicating whether predictive mean matching is used. Provide as a list for each variable.
+#' @param formula - If not all variables are used as predictors, or if transformations or interactions are required (applies to all X; for Y only transformations are possible). Only applicable for the methods "robust" and "regularized". Provide as a list for each variable that requires specific conditions.
+#    - formula format: list(variable_1 ~ age + length, variable_2 ~ width + country)
+#    - formula format with transformation: list(log(variable_1) ~ age + I(1/length), variable_2 ~ width + country)
+#    - For X: follows the rules of model.matrix
+#    - For Y: supported transformations are log(), exp(), sqrt(), I(1/..). Only applicable for numeric variables.
+#' @param sequential - If TRUE, all variables are imputed sequentially.
+#' @param nseq - Maximum number of iterations (if sequential is TRUE).
+#' @param eps - Threshold for convergence.
+#' @param imp_var - If TRUE, the imputed values are stored.
+#' @param pred_history - If TRUE, all predicted values across all iterations are stored.
+#' @param tune - If TRUE, hyperparameters are tuned halfway through the iterations.
+#' @param verbose - If TRUE, additional debugging output is provided
+#' @return imputed data set or c(imputed data set, prediction history)
+#' @export
+#'
+#' @family imputation methods
+#' @examples
+#' \dontrun{
+#' x <- vimpute(data = sleep, sequential = FALSE)
+#' y <- vimpute(data = sleep, sequential = TRUE, nseq = 3)
+#' z <- vimpute(data = sleep, considered_variables =
+#'   c("Sleep", "Dream", "Span", "BodyWgt"), sequential = FALSE)
+#' }
+#########################################################################################
+#########################################################################################
+#########################################################################################
+
+vimpute <- function(
+    data,
+    considered_variables = names(data),
+    method = setNames(as.list(rep("ranger", length(considered_variables))), considered_variables),
+    pmm = setNames(as.list(rep(TRUE, length(considered_variables))), considered_variables),
+    formula = FALSE,
+    sequential = TRUE,
+    nseq = 10,
+    eps = 0.005,
+    imp_var = TRUE,
+    pred_history = FALSE,
+    tune = FALSE,
+    verbose = FALSE
+) {
+  ..cols <- ..feature_cols <- ..reg_features <- ..relevant_features <- NULL
+  # save plan
+  old_plan <- future::plan() # Save current plan
+  on.exit(future::plan(old_plan), add = TRUE) # Restore on exit, even if error
+
+  ### ***** Learner START ***** ###################################################################################################
+  no_change_counter <- 0
+  robust_required <- any(unlist(method) == "robust")
+
+  if (robust_required) {
+    register_robust_learners()
+  }
+
+  learner_ids <- c(
+    "regr.cv_glmnet", "regr.glmnet", "classif.glmnet",
+    "regr.ranger", "classif.ranger",
+    "regr.xgboost", "classif.xgboost"
+  )
+
+  if (robust_required) {
+    learner_ids <- c(learner_ids, "regr.lm_rob", "classif.glm_rob")
+  }
+
+  learners <- lapply(learner_ids, function(id) lrn(id))
+  names(learners) <- learner_ids
+  ### Learner End ###
+
+  # only defined variables
+  data_all_variables <- as.data.table(data)
+  data <- as.data.table(data)[, considered_variables, with = FALSE]
+
+  # save factor levels
+  factor_levels <- list()
+  for (col in names(data)) {
+    if (is.factor(data[[col]])) {
+      factor_levels[[col]] <- levels(data[[col]])
+    } else if (is.character(data[[col]])) {
+      factor_levels[[col]] <- unique(na.omit(data[[col]]))
+    }
+  }
+
+  # message("factor levels:")
+  # message(capture.output(print(factor_levels)))
+
+  ### ***** Check Data Start ***** ###################################################################################################
+  if (verbose) {
+    message(paste("***** Check Data"))
+  }
+  checked_data <- precheck(data, pmm, formula, method, sequential)
+  data <- checked_data$data
+  variables <- checked_data$variables
+  variables_NA <- checked_data$variables_NA
+  method <- checked_data$method
+
+  if (!sequential && nseq > 1) {
+    if (verbose) message("'nseq' was set to 1 because 'sequential = FALSE'.")
+    nseq <- 1
+  }
+
+  #orig_data <- data
+
+  ### Check Data End ###
+
+  ### ***** Def missing indices Start ***** ###################################################################################################
+  if (verbose) {
+    message(paste("***** Find Missing Indices"))
+  }
+  missing_indices <- setNames(lapply(variables, function(var) { # original NAs
+    na_idx <- which(is.na(data[[var]]))
+    if (length(na_idx) > 0) return(na_idx) else return(integer(0))
+  }), variables)
+  missing_indices <- missing_indices[!sapply(missing_indices, is.null)]
+  names(missing_indices)
+  ### Def missing indices End ###
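+  # For illustration (assumed example): with data$a = c(1, NA, 3) and
+  # data$b = c(NA, NA, 2), missing_indices is list(a = 2L, b = c(1L, 2L)) --
+  # the row positions of the original NAs that are (re-)imputed below.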
+
+  po_ohe <- NULL # OHE pipe op starts as NULL and is created later if OHE is needed
+  data_new <- copy(data)
+  original_data <- copy(data) # Saves original structure of data
+
+  if (pred_history == TRUE) {
+    history <- list() # save history of predicted values
+  }
+
+  count_tuned_better <- 0
+  count_default_better <- 0
+
+  hyperparameter_cache <- setNames(vector("list", length(variables_NA)), variables_NA)
+  tuning_status <- setNames(rep(FALSE, length(variables_NA)), variables_NA)
+
+  tuning_log <- list()
+
+  # Iterative Imputation for nseq iterations
+  for (i in seq_len(nseq)) {
+    if (verbose) {
+      message(paste("ITERATION", i, "of", nseq))
+    }
+    iteration_times <- list()
+    data_prev <- copy(data)
+
+    for (var in variables_NA) {
+      if (verbose) {
+        message(paste("***** Impute variable:", var))
+      }
+      var_start_time <- Sys.time()
+
+      data_before <- copy(data)
+      variables <- checked_data$variables
+      if (verbose) {
+        message(paste("***** Select predictors"))
+      }
+      if (!isFALSE(formula)) {
+        selected_formula <- select_formula(formula, var) # formula with var on the left-hand side
+        if (verbose) {
+          message(paste("Selected formula for variable", var, ":", selected_formula))
+        }
+      }
+
+      ### ***** Formula Extraction Start ***** ###################################################################################################
+      if (!isFALSE(formula) && (!isFALSE(selected_formula))) {
+        identified_variables <- identify_variables(selected_formula, data, var)
+        target_col <- var
+        feature_cols <- identified_variables$predictor_variables
+        selected_cols <- c(target_col, feature_cols)
+
+        rewrite_formula <- function(formula, target_variable) {
+          formula_str <- as.character(formula)
+          new_formula_str <- paste0(target_variable, " ~ ", formula_str[3])
+          as.formula(new_formula_str)
+        }
+
+        rewrited_formula <- rewrite_formula(selected_formula, target_col) # rewrite the formula with the plain target on the left
+
+        # Remove missing values (na.omit) -> for Training
+        data <- enforce_factor_levels(data, factor_levels) # <--- important
+        data_clean <- na.omit(data)
+
+        # message("factor levels data clean")
+        # levels_list <- sapply(data_clean, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+
+        check_all_factor_levels(data_clean, factor_levels)
+
+        is_target_numeric <- is.numeric(data[[target_col]])
+
+        if (is_target_numeric) {
+          task_mm_na_omit <- TaskRegr$new(
+            id = "imputation_task_na_omit",
+            backend = data_clean,
+            target = target_col
+          )
+        } else {
+          task_mm_na_omit <- TaskClassif$new(
+            id = "imputation_task_na_omit",
+            backend = data_clean,
+            target = target_col
+          )
+        }
+
+        # modelmatrix for x variables
+        po_mm_na_omit <- PipeOpModelMatrix$new()
+        po_mm_na_omit$param_set$values$formula <- rewrited_formula
+        rewrited_formula <- as.formula(paste("~", as.character(rewrited_formula)[3]))
+        po_mm_na_omit$param_set$values$formula <- rewrited_formula
+
+        mm_task_na_omit <- po_mm_na_omit$train(list(task_mm_na_omit))[[1]]
+        data_temp <- mm_task_na_omit$data()
+        data_temp <- as.data.table(data_temp)
+
+        clean_colnames <- function(names) {
+          names <- make.names(names, unique = TRUE)
+          patterns <- c("\\(", "\\)", ":", "\\*", "\\^", "%in%", "/", "-", "\\+", " ")
+          replacements <- c("", "", "_int_", "_cross_", "_pow_", "_nest_", "_sub_", "_minus_", "_plus_", "")
+          for (i in seq_along(patterns)) {
+            names <- gsub(patterns[i], replacements[i], names)
+          }
+
+          return(names)
+        }
+        setnames(data_temp, clean_colnames(names(data_temp)))
+        data_temp <- enforce_factor_levels(data_temp, factor_levels)
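+        # Standalone sketch of the model-matrix step above (assumes mlr3pipelines;
+        # task and formula are placeholders): the pipe op expands a task's features
+        # into design-matrix columns, e.g.
+        # pom <- po("modelmatrix", formula = ~ x1 * x2)
+        # pom$train(list(task))[[1]]$data()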
+        check_all_factor_levels(data_temp, factor_levels)
+
+        # message("factor levels data temp")
+        # levels_list <- sapply(data_temp, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+
+        # Impute missing values (histogram/mode) -> for prediction
+        if (is_target_numeric) {
+          task_mm <- TaskRegr$new(id = "imputation_task_mm", backend = data, target = target_col)
+        } else {
+          task_mm <- TaskClassif$new(id = "imputation_task_mm", backend = data, target = target_col)
+        }
+
+        pipeline_impute <- po("imputehist") %>>% # Histogram-based imputation for numeric variables
+          po("imputemode") %>>% # Mode imputation for categorical variables
+          po("modelmatrix", formula = rewrited_formula) # Create design matrix
+
+        pipeline_impute$train(task_mm)
+        po_task_mm <- pipeline_impute$predict(task_mm)[[1]]
+        mm_data <- po_task_mm$data() # mm_data = transformed data with missings filled in, data_temp = transformed data without missings
+        mm_data <- as.data.table(mm_data)
+        setnames(mm_data, clean_colnames(names(mm_data)))
+        mm_data <- enforce_factor_levels(mm_data, factor_levels)
+        check_all_factor_levels(mm_data, factor_levels)
+
+
+        # Identify target transformation
+        lhs_transformation <- identify_lhs_transformation(selected_formula) # transformation on the left-hand side
+
+        if (!is.numeric(data_temp[[var]]) && !is.null(lhs_transformation)) {
+          stop(paste("Error: The target variable must be numeric if a transformation is to be applied. Current class of the target variable:", class(data_temp[[var]])))
+        }
+
+        # Create PipeOpTargetTrafo if a transformation is detected
+        if (!is.null(lhs_transformation)) {
+          if (lhs_transformation == "exp") {
+            transformation <- function(x) ifelse(is.na(x), NA, exp(x))
+          } else if (lhs_transformation == "log") {
+            transformation <- function(x) ifelse(is.na(x), NA, log(x))
+          } else if (lhs_transformation == "sqrt") {
+            transformation <- function(x) ifelse(is.na(x), NA, sqrt(x))
+          } else if (lhs_transformation == "inverse") {
+            transformation <- function(x) ifelse(is.na(x), NA, 1 / x)
+          } else {
+            stop("Unknown transformation: ", lhs_transformation)
+          }
+
+          data_temp[[var]] <- transformation(data_temp[[var]])
+          mm_data[[var]] <- transformation(mm_data[[var]])
+        }
+
+
+        ### Formula Extraction End ###
+
+      } else {
+        lhs_transformation <- NULL
+        selected_formula <- FALSE
+        feature_cols <- setdiff(variables, var)
+        target_col <- var
+        selected_cols <- c(target_col, feature_cols)
+        data <- data[, selected_cols, with = FALSE]
+        data_temp <- as.data.table(data)
+        data_temp <- enforce_factor_levels(data_temp, factor_levels)
+        check_all_factor_levels(data_temp, factor_levels)
+
+        # message("factor levels data_temp")
+        # levels_list <- sapply(data_temp, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+      }
+
+      if ("Intercept" %in% colnames(data_temp)) {
+        data_temp <- data_temp[, !colnames(data_temp) %in% "Intercept", with = FALSE]
+        mm_data <- mm_data[, !colnames(mm_data) %in% "Intercept", with = FALSE]
+
+      }
+
+      if (!isFALSE(selected_formula)) {
+        if (!method[[var]] %in% c("robust", "regularized")) {
+          stop("Error: A formula can only be used with the 'robust' or 'regularized' methods.")
+        }
+      }
+
+      method_var <- method[[var]]
+
+
+
+      ### ***** Select suitable learner Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** Select Learner"))
+      }
+      if (is.numeric(data[[target_col]])) {
+        learners_list <- list(
+          regularized = list(learners[["regr.cv_glmnet"]], learners[["regr.glmnet"]]),
+          robust = list(learners[["regr.lm_rob"]]),
+          ranger = list(learners[["regr.ranger"]]),
+          xgboost = list(learners[["regr.xgboost"]])
+        )
+      } else if (is.factor(data[[target_col]])) {
+        if (method_var == "robust" && length(levels(data[[target_col]])) != 2) {
+          warning(paste0("Variable '", target_col, "' is not binary: use an alternative method to glm_rob"))
+        }
+        learners_list <- list(
+          regularized = list(learners[["classif.glmnet"]]),
+          robust = list(learners[["classif.glm_rob"]]),
+          ranger = list(learners[["classif.ranger"]]),
+          xgboost = list(learners[["classif.xgboost"]])
+        )
+      }
+      learner_candidates <- learners_list[[method_var]]
+      ### Select suitable learner End ***** ####
+
+      ### ***** OHE Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** OHE"))
+      }
+      data_temp <- enforce_factor_levels(data_temp, factor_levels)
+
+      needs_ohe <- any(sapply(learner_candidates, function(lrn) {
+        supports_factors <- "factor" %in% lrn$feature_types
+        has_factors <- any(sapply(feature_cols, function(col) is.factor(data_temp[[col]])))
+        # cat("Learner supports factors:", supports_factors, "\n")
+        !supports_factors && has_factors
+      }))
+
+      # if a formula was selected --> no OHE needed
+      if (!isFALSE(selected_formula)) { # model.matrix does OHE automatically in mlr3
+        # cat("Selected formula exists, no OHE needed.\n")
+        needs_ohe <- FALSE
+      }
+      if (verbose) {
+        cat("needs_ohe:", needs_ohe, "\n")
+      }
+      # CONDITION FOR OHE
+      # (1) At least one learner in learner_candidates does not support factors.
+      # (2) At least one of the feature columns (feature_cols) is a factor.
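+      # Standalone sketch of the encoding step used below (assumes mlr3 and
+      # mlr3pipelines; the data frame is a made-up example):
+      # enc  <- po("encode", method = "one-hot")
+      # task <- as_task_regr(data.frame(y = rnorm(6), g = factor(rep(c("a", "b"), 3))), target = "y")
+      # enc$train(list(task))[[1]]$data()  # factor g becomes indicator columns g.a and g.b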
+      # Check whether the target is a categorical variable (factor) or numeric
+      if (is.factor(data_temp[[target_col]])) {
+        task_type <- "classif"
+      } else {
+        task_type <- "regr"
+      }
+
+      if (needs_ohe) {
+        # print("One-hot encoding required")
+        po_ohe <- po("encode", method = "one-hot")
+
+        # OHE on data
+        if (task_type == "regr") {
+          train_task <- as_task_regr(data_temp, target = target_col)
+        } else {
+          train_task <- as_task_classif(data_temp, target = target_col)
+        }
+
+        po_ohe$train(list(train_task)) # Train Encoder
+
+        # Apply the encoding to the training data
+        data_temp <- po_ohe$predict(list(train_task))[[1]]$data()
+      }
+
+      # message("factor levels data temp")
+      # levels_list <- sapply(data_temp, function(col) if (is.factor(col)) levels(col) else NULL)
+      # message(capture.output(print(levels_list)))
+
+
+      ### OHE End ###
+
+      ### ***** Create task Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** Create task"))
+      }
+      data_y_fill <- copy(data_temp)
+      supports_missing <- all(sapply(learner_candidates, function(lrn) "missings" %in% lrn$properties))
+
+      if (method_var == "ranger") {
+        supports_missing <- FALSE
+      }
+
+      # If NA in target variable --> only train with the data that has no NA in Y
+      data_y_fill <- data_y_fill[!is.na(get(target_col))]
+
+      # If the learner does not support missing values -> use na.omit()
+      data_y_fill_final <- if (supports_missing) data_y_fill else na.omit(data_y_fill)
+      data_y_fill_final <- enforce_factor_levels(data_y_fill_final, factor_levels)
+
+
+      # Create task
+      if (is.numeric(data_y_fill_final[[target_col]])) {
+        task <- TaskRegr$new(id = target_col, backend = data_y_fill_final, target = target_col)
+      } else if (is.factor(data_y_fill_final[[target_col]])) {
+        task <- TaskClassif$new(id = target_col, backend = data_y_fill_final, target = target_col)
+      } else {
+        stop("Error: Target variable is neither numeric nor a factor!")
+      }
+
+      ### ***** Create Learner Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** Create Learner"))
+      }
+      max_threads <- future::availableCores() - 1
+      if (nrow(data_y_fill_final) < 10000) {
+        optimal_threads <- 1
+      } else if (nrow(data_y_fill_final) < 100000) {
+        optimal_threads <- max(1, max_threads %/% 2)
+      } else {
+        optimal_threads <- max_threads
+      }
+      # XGBoost Parameter
+      xgboost_params <- list(
+        nrounds = 100,
+        max_depth = 3,
+        eta = 0.1,
+        min_child_weight = 1,
+        subsample = 1,
+        colsample_bytree = 1,
+        # tree_method = "hist",
+        # early_stopping_rounds = 10,
+        verbose = 1,
+        nthread = optimal_threads
+      )
+      if (verbose) {
+        print(paste("nthread is set to:", optimal_threads))
+      }
+      # Ranger Parameter
+      ranger_params <- list(
+        num.trees = 500,
+        num.threads = 4
+      )
+
+      if (length(learner_candidates) > 1) {
+        resample_results <- lapply(learner_candidates, function(lrn) {
+
+          if (grepl("xgboost", lrn$id)) {
+            lrn$param_set$values <- modifyList(lrn$param_set$values, xgboost_params)
+          } else if (grepl("ranger", lrn$id)) {
+            lrn$param_set$values <- modifyList(lrn$param_set$values, ranger_params)
+          }
+          resample(task, lrn, rsmp("cv", folds = 5))
+        })
+
+        mse_values <- sapply(resample_results, function(res) res$aggregate(msr("regr.rmse")))
+        best_learner <- learner_candidates[[which.min(mse_values)]]
+
+      } else {
+        best_learner <- learner_candidates[[1]]
+      }
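+      # Standalone sketch of the CV-based selection above (assumes mlr3; task is a
+      # placeholder):
+      # rr <- resample(task, lrn("regr.glmnet"), rsmp("cv", folds = 5))
+      # rr$aggregate(msr("regr.rmse"))  # mean CV-RMSE; the candidate with the minimum wins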
+      # Initialize learner and set parameters
+      learner_obj <- lrn(best_learner$id)
+      default_learner <- learner_obj$clone(deep = TRUE)
+      current_learner <- learner_obj$clone(deep = TRUE)
+      best_learner <- learner_obj$clone(deep = TRUE)
+      tuned_learner <- learner_obj$clone(deep = TRUE)
+
+      # Set parameters for the best learner
+      if (grepl("xgboost", best_learner$id)) {
+        best_learner$param_set$values <- modifyList(best_learner$param_set$values, xgboost_params)
+        default_learner$param_set$values <- modifyList(default_learner$param_set$values, xgboost_params)
+        current_learner$param_set$values <- modifyList(current_learner$param_set$values, xgboost_params)
+        tuned_learner$param_set$values <- modifyList(tuned_learner$param_set$values, xgboost_params)
+      } else if (grepl("ranger", best_learner$id)) {
+        best_learner$param_set$values <- modifyList(best_learner$param_set$values, ranger_params)
+        default_learner$param_set$values <- modifyList(default_learner$param_set$values, ranger_params)
+        current_learner$param_set$values <- modifyList(current_learner$param_set$values, ranger_params)
+        tuned_learner$param_set$values <- modifyList(tuned_learner$param_set$values, ranger_params)
+      }
+
+      if (is.factor(data_temp[[target_col]])) {
+        best_learner$predict_type <- "prob"
+        default_learner$predict_type <- "prob"
+        current_learner$predict_type <- "prob"
+        tuned_learner$predict_type <- "prob"
+      }
+
+      ### Create Learner End ###
+
+      ### ***** Hyperparameter Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** Parametertuning"))
+      }
+
+      if (!tuning_status[[var]] && nseq >= 2 && tune) {
+
+        if ((nseq > 2 && i == round(nseq / 2)) || (nseq == 2 && i == 2)) {
+          # print("Start hyperparameter tuning")
+          tuner = tnr("random_search")
+          p = length(task$feature_names)
+
+          search_spaces <- list(
+            "regr.cv_glmnet" = ps(alpha = p_dbl(0, 1), nfolds = p_int(7, 20)),
+            "regr.glmnet" = ps(alpha = p_dbl(0, 1), lambda = p_dbl(10^-4, 10^2, logscale = TRUE)),
+            "classif.glmnet" = ps(alpha = p_dbl(0, 1), lambda = p_dbl(10^-4, 10^2, logscale = TRUE)),
+            # alpha = mixture between lasso (1) and ridge (0)
+            # lambda = the larger, the stronger the regularization
+
+            # Ranger
+            "regr.ranger" = ps(num.trees = p_int(500, 700, default = 500), min.node.size = p_int(3, 10, default = 5), sample.fraction = p_dbl(0.8, 1)),
+            "classif.ranger" = ps(num.trees = p_int(500, 700), min.node.size = p_int(3, 10), sample.fraction = p_dbl(0.8, 1)),
+            # min.node.size = minimum node size
+            # sample.fraction = proportion of data sampled per tree
+
+            # XGBoost
+            "regr.xgboost" = ps(nrounds = p_int(100, 500), eta = p_dbl(0.01, 0.3), max_depth = p_int(3, 9), colsample_bytree = p_dbl(0.7, 0.9)),
+            "classif.xgboost" = ps(nrounds = p_int(100, 500), eta = p_dbl(0.01, 0.3), max_depth = p_int(3, 9), subsample = p_dbl(0.7, 0.9), colsample_bytree = p_dbl(0.7, 0.9)),
+            # eta = learning rate, low values --> more stable but slower models
+            # max_depth = maximum tree depth, large values --> more complex patterns but possibly overfitting
+            # subsample = proportion of data used per boosting iteration | colsample_bytree = proportion of features used per tree
+
+            # Robust Models
+            "regr.lm_rob" = ps(tuning.chi = p_dbl(1.2, 1.8), tuning.psi = p_dbl(1.2, 1.8), max.it = p_int(60, 300)), # psi = p_fct(c("bisquare", "optimal")),
+            "classif.glm_rob" = ps(method = p_fct(c("Mqle", "WBY")), acc = p_dbl(0, 0.1), test.acc = p_fct(c("coef", "resid")), tcc = p_dbl(1, 2), maxit = p_int(30, 300))
+            # psi = function for weighting the residuals
+            # acc = tolerance for the convergence of the algorithm | test.acc = criterion for the convergence test | tcc = tuning constant
+          )
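+          # Standalone sketch of the tuning pattern used below (assumes mlr3tuning;
+          # task and learner are placeholders):
+          # instance <- TuningInstanceBatchSingleCrit$new(
+          #   task = tsk("mtcars"), learner = lrn("regr.ranger"),
+          #   resampling = rsmp("cv", folds = 3), measure = msr("regr.rmse"),
+          #   search_space = ps(num.trees = p_int(100, 500)),
+          #   terminator = trm("evals", n_evals = 5))
+          # tnr("random_search")$optimize(instance); instance$result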
+          best_learner_id = best_learner$id
+          # best_learner = learners[[best_learner_id]]
+          search_space = search_spaces[[best_learner_id]]
+
+          # future::plan("multisession")
+
+          tryCatch({
+            # train default model
+            if (best_learner_id == "classif.xgboost" || best_learner_id == "regr.xgboost") {
+              default_learner$param_set$values$nrounds = 100 # Set a default value for nrounds
+            }
+
+            resampling = rsmp("cv", folds = 5)
+            resampling$instantiate(task)
+
+            # tuning instance
+            instance = TuningInstanceBatchSingleCrit$new(
+              task = task,
+              learner = best_learner,
+              resampling = resampling,
+              # resampling = rsmp("cv", folds = 5, repeats = 3),
+              measure = if (task$task_type == "regr") msr("regr.rmse") else msr("classif.acc"),
+              search_space = search_space,
+              terminator = trm("evals", n_evals = 20)
+            )
+
+            # tuning
+            tuner <- tnr("random_search", batch_size = parallel::detectCores() - 1)
+            tuner$optimize(instance)
+
+            # save best parameters
+            best_params <- as.list(instance$result[, get("learner_param_vals")][[1]])
+            tuning_status[[var]] <- TRUE
+
+            # compare with default
+            tuned_learner$param_set$values <- best_params
+
+            resampling1 = rsmp("cv", folds = 5)
+            resampling1$instantiate(task)
+            default_result <- resample(task, default_learner, resampling1)
+
+            resampling2 = rsmp("cv", folds = 5)
+            resampling2$instantiate(task)
+            tuned_result <- resample(task, tuned_learner, resampling2)
+
+            # which model is better
+            if (task$task_type == "regr") {
+              if (tuned_result$aggregate(msr("regr.rmse")) < default_result$aggregate(msr("regr.rmse"))) {
+                current_learner$param_set$values <- best_params
+                count_tuned_better <- count_tuned_better + 1
+
+                hyperparameter_cache[[var]] <- list(
+                  params = best_params,
+                  is_tuned = TRUE
+                )
+
+                if (verbose) {
+                  cat(sprintf("Tuned parameters for variable '%s': %s\n", var, paste(names(best_params), best_params, sep = "=", collapse = ", ")))
+                  flush.console()
+                }
+
+              } else {
+                current_learner$param_set$values <- default_learner$param_set$values
+                count_default_better <- count_default_better + 1
+                hyperparameter_cache[[var]] <- list(
+                  params = default_learner$param_set$values,
+                  is_tuned = FALSE
+                )
+                if (verbose) {
+                  cat(sprintf("Default parameters for variable '%s': %s\n", var, paste(names(default_learner$param_set$values), default_learner$param_set$values, sep = "=", collapse = ", ")))
+                  flush.console()
+                }
+
+              }
+
+            } else {
+              if (tuned_result$aggregate(msr("classif.acc")) > default_result$aggregate(msr("classif.acc"))) {
+                current_learner$param_set$values <- best_params
+                count_tuned_better <- count_tuned_better + 1
+
+                hyperparameter_cache[[var]] <- list(
+                  params = best_params,
+                  is_tuned = TRUE
+                )
+                if (verbose) {
+                  cat(sprintf("Tuned parameters for variable '%s': %s\n", var, paste(names(best_params), best_params, sep = "=", collapse = ", ")))
+                  flush.console()
+                }
+
+              } else {
+                current_learner$param_set$values <- default_learner$param_set$values
+                count_default_better <- count_default_better + 1
+                hyperparameter_cache[[var]] <- list(
+                  params = default_learner$param_set$values,
+                  is_tuned = FALSE
+                )
+                if (verbose) {
+                  cat(sprintf("Default parameters for variable '%s': %s\n", var, paste(names(default_learner$param_set$values), default_learner$param_set$values, sep = "=", collapse = ", ")))
+                  flush.console()
+                }
+              }
+            }
+
+          }, error = function(e) {
+            warning(sprintf("Tuning failed for variable '%s': %s. Using default parameters.", var, e$message))
+            current_learner$param_set$values <- default_learner$param_set$values
+            tuning_status[[var]] <- FALSE
+            hyperparameter_cache[[var]] <- list(
+              params = default_learner$param_set$values,
+              is_tuned = FALSE
+            )
+          })
+
+          future::plan("sequential")
+        } else if (tuning_status[[var]]) {
+          # Use cached parameters
+          current_learner$param_set$values <- hyperparameter_cache[[var]]$params
+        }
+      } else {
+        # No tuning - use default parameters
+        current_learner$param_set$values <- list()
+      }
+
+      # tuning_log
+      tuning_log[[length(tuning_log) + 1]] <- list(
+        variable = var,
+        tuned_better = isTRUE(hyperparameter_cache[[var]]$is_tuned)
+      )
+
+      if (verbose) {
+        # print tuning_log
+        if (length(tuning_log) > 0) {
+          print("Tuning Log:")
+          print(tuning_log)
+        }
+      }
+      ### Hyperparameter End ###
+
+      ### ***** NAs Start ***** ###################################################################################################
+      if (verbose) {
+        message(paste("***** Handle NAs in feature variables"))
+      }
+      if (method_var == "xgboost") {
+        po_x_miss <- NULL # No modification necessary as xgboost accepts missings as NA
+
+      } else if (supports_missing) {
+        if (sum(is.na(data_temp)) > 0) { # task$data_temp()
+          po_x_miss <- po("missind", param_vals = list(
+            affect_columns = mlr3pipelines::selector_all(),
+            which = "all",
+            type = "factor"
+          ))
+        } else {
+          po_x_miss <- NULL
+        }
+
+      }
+      ### NAs End ###
+
+      ### ***** Train Model Start ***** ###################################################################################################
+      if (verbose) {
+        message("***** Train Model")
+      }
+
+      po_fixfactors <- po("fixfactors")
+
+      # Check semicontinuous
+      is_sc <- is_semicontinuous(data_temp[[var]])
+
+      if (is_sc) {
+
+        # 1) Classification: 0 or > 0
+        zero_flag_col <- paste0(var, "_zero_flag")
+
+        data_temp[[zero_flag_col]] <- factor(ifelse(data_temp[[var]] == 0, "zero", "positive"))
+        relevant_features <- setdiff(names(data_temp), c(var, zero_flag_col))
+        class_data <- data_temp[!is.na(data_temp[[var]]) & complete.cases(data_temp[, ..relevant_features]), ]
+        train_data <- class_data
+        train_data <- enforce_factor_levels(train_data, factor_levels)
+
+        # message("levels in train data")
+        # levels_list <- sapply(train_data, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+
+        feature_cols <- setdiff(names(train_data), c(var, zero_flag_col))
+
+        # classification task
+        class_task <- TaskClassif$new(
+          id = paste0(zero_flag_col, "_task"),
+          backend = train_data,
+          target = zero_flag_col
+        )
+        class_task$select(setdiff(names(train_data), c(var, zero_flag_col)))
+
+
+        # learner
+        regr_learner_id <- best_learner$id
+        classif_learner <- lrn("classif.log_reg")
+        regr_learner <- lrn(regr_learner_id)
+        if (grepl("xgboost", best_learner$id)) {
+          regr_learner$param_set$values <- modifyList(regr_learner$param_set$values, xgboost_params)
+        } else if (grepl("ranger", best_learner$id)) {
+          regr_learner$param_set$values <- modifyList(regr_learner$param_set$values, ranger_params)
+        }
+        # does the classification learner support missings?
+        supports_missing_classif <- "missings" %in% classif_learner$properties
+        if (method_var == "ranger") supports_missing_classif <- FALSE
+
+
+        # if missings are supported
+        po_x_miss_classif <- NULL
+        if (sum(is.na(data_temp[[var]])) > 0 && supports_missing_classif && method_var != "xgboost") {
+          po_x_miss_classif <- po("missind", param_vals = list(
+            affect_columns = mlr3pipelines::selector_all(),
+            which = "all",
+            type = "factor"
+          ))
+        }
+
+        class_pipeline <- if (!is.null(po_x_miss_classif)) po_x_miss_classif %>>% classif_learner else classif_learner
+        # class_pipeline <- po_fixfactors %>>% class_pipeline
+        class_learner <- GraphLearner$new(class_pipeline)
+
+        # hyperparameter cache for classification
+        if (isTRUE(tuning_status[[var]]) && !is.null(tuning_status[[zero_flag_col]]) && isTRUE(tuning_status[[zero_flag_col]])) {
+          if (!is.null(hyperparameter_cache[[zero_flag_col]]) && isTRUE(hyperparameter_cache[[zero_flag_col]]$is_tuned)) {
+            params <- hyperparameter_cache[[zero_flag_col]]$params
+
+            # without prefix
+            pipeline_valid <- intersect(names(params), class_pipeline$param_set$ids())
+            class_pipeline$param_set$values <- modifyList(class_pipeline$param_set$values, params[pipeline_valid])
+
+            # with prefix
+            prefixed_names <- paste0(best_learner$id, ".", names(params))
+            learner_valid <- prefixed_names %in% class_learner$param_set$ids()
+            if (any(learner_valid)) {
+              prefixed_params <- setNames(params[learner_valid], prefixed_names[learner_valid])
+              class_learner$param_set$values <- modifyList(class_learner$param_set$values, prefixed_params)
+            }
+
+            # warning if missing variables
+            missing_in_pipeline <- setdiff(names(params), class_pipeline$param_set$ids())
+            missing_in_learner <- setdiff(names(params),
+                                          sub(paste0("^", best_learner$id, "\\."), "",
+                                              class_learner$param_set$ids()[startsWith(class_learner$param_set$ids(), best_learner$id)]))
+            if (length(missing_in_pipeline) > 0) warning("Missing in Pipeline (classification): ", paste(missing_in_pipeline, collapse = ", "))
+            if (length(missing_in_learner) > 0) warning("Missing in Learner (classification): ", paste(missing_in_learner, collapse = ", "))
+          }
+        }
+
+        # predict type for classification
+        if ("prob" %in% class_learner$predict_types) {
+          class_learner$predict_type <- "prob"
+        } else {
+          class_learner$predict_type <- "response"
+          warning(sprintf("predict_type 'prob' not supported by learner '%s'; fallback to 'response'", class_learner$id))
+        }
+
+        # train classification model
+        class_learner$train(class_task)
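+        # Standalone sketch of the GraphLearner pattern used here (assumes
+        # mlr3pipelines; the pieces are placeholders):
+        # graph <- po("missind") %>>% lrn("classif.log_reg")
+        # glrn  <- GraphLearner$new(graph)  # the pipeline now behaves like a single learner
+        # glrn$train(task); glrn$predict(task)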
+
+        # 2) Regression
+        reg_data <- data_temp[data_temp[[var]] > 0, ]
+        reg_features <- setdiff(names(reg_data), c(var, zero_flag_col))
+        reg_data <- reg_data[!is.na(reg_data[[var]]), ] # only rows without NA
+        reg_data <- enforce_factor_levels(reg_data, factor_levels)
+        has_na_in_features <- anyNA(reg_data[, ..reg_features])
+
+        # message("levels in reg data")
+        # levels_list <- sapply(reg_data, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+
+        # does the regression learner support missings?
+        supports_missing <- "missings" %in% regr_learner$properties
+        if (method_var == "ranger") supports_missing <- FALSE
+
+        # missings as indicator variables
+        po_x_miss_reg <- NULL
+        if (has_na_in_features && supports_missing && method_var != "xgboost") {
+          po_x_miss_reg <- po("missind", param_vals = list(
+            affect_columns = mlr3pipelines::selector_all(),
+            which = "all",
+            type = "factor"
+          ))
+          # po_x_miss_reg <- po_fixfactors %>>% po_x_miss_reg
+        }
+
+        # Fallback: if the learner cannot handle missings
+        if (has_na_in_features && !supports_missing) {
+          cols <- c(reg_features, var)
+          reg_data <- na.omit(reg_data[, ..cols])
+          reg_data <- enforce_factor_levels(reg_data, factor_levels)
+          check_all_factor_levels(reg_data, factor_levels)
+        }
+
+        # message("levels in reg data")
+        # levels_list <- sapply(reg_data, function(col) if (is.factor(col)) levels(col) else NULL)
+        # message(capture.output(print(levels_list)))
+
+        # Task
+        reg_task <- TaskRegr$new(id = var, backend = reg_data, target = var)
+        reg_task$select(reg_features)
+
+        # Pipeline
+        reg_pipeline <- if (!is.null(po_x_miss_reg)) po_x_miss_reg %>>% regr_learner else regr_learner
+        # reg_pipeline <- po_fixfactors %>>% reg_pipeline
+        reg_learner <- GraphLearner$new(reg_pipeline)
+
+        # hyperparameter cache
+        if (isTRUE(tuning_status[[var]]) && !is.null(tuning_status[[zero_flag_col]]) && isTRUE(tuning_status[[zero_flag_col]])) {
+          if (!is.null(hyperparameter_cache[[var]]) && isTRUE(hyperparameter_cache[[var]]$is_tuned)) {
+            params <- hyperparameter_cache[[var]]$params
+            if (verbose) {
+              cat(sprintf("Use optimized parameters from the cache for %s\n", var))
+            }
+
+            pipeline_valid <- intersect(names(params), reg_pipeline$param_set$ids())
+            reg_pipeline$param_set$values <- modifyList(reg_pipeline$param_set$values, params[pipeline_valid])
+
+            prefixed_names <- paste0(best_learner$id, ".", names(params))
+            learner_valid <- prefixed_names %in% reg_learner$param_set$ids()
+            if (any(learner_valid)) {
+              prefixed_params <- setNames(params[learner_valid], prefixed_names[learner_valid])
+              reg_learner$param_set$values <- modifyList(reg_learner$param_set$values, prefixed_params)
+            }
+
+            missing_in_pipeline <- setdiff(names(params), reg_pipeline$param_set$ids())
+            missing_in_learner <- setdiff(names(params),
+                                          sub(paste0("^", best_learner$id, "\\."), "",
+                                              reg_learner$param_set$ids()[startsWith(reg_learner$param_set$ids(), best_learner$id)]))
+            if (length(missing_in_pipeline) > 0) warning("Missing in Pipeline (regression): ", paste(missing_in_pipeline, collapse = ", "))
+            if (length(missing_in_learner) > 0) warning("Missing in Learner (regression): ", paste(missing_in_learner, collapse = ", "))
+          }
+        }
+
+        # Train regression model
+        reg_learner$train(reg_task)
+
+        # save models
+        learner <- list(classifier = class_learner, regressor = reg_learner)
+
+      # if not semicontinuous
+      } else {
+
+        # base pipeline with the best learner
+        full_pipeline <- current_learner
+        if (grepl("xgboost", best_learner$id)) {
+          current_learner$param_set$values <- modifyList(current_learner$param_set$values, xgboost_params)
+        } else if (grepl("ranger", best_learner$id)) {
+          current_learner$param_set$values <- modifyList(current_learner$param_set$values, ranger_params)
+        }
+
+        # Handling of missing values
+        if (method_var != "xgboost" && supports_missing && !is.null(po_x_miss)) { # xgboost can handle NAs directly; supports_missing marks learners that handle missings when they are flagged as such
+          full_pipeline <- po_x_miss %>>% full_pipeline
+        }
+        # full_pipeline <- po_fixfactors %>>% full_pipeline
+
+      # create and train the GraphLearner
+      learner <- GraphLearner$new(full_pipeline)
+
+      if (isTRUE(tuning_status[[var]])) {
+        # if tuning was done
+        if (!is.null(hyperparameter_cache[[var]]) && isTRUE(hyperparameter_cache[[var]]$is_tuned)) {
+          # optimized parameters exist in the cache
+          params <- hyperparameter_cache[[var]]$params
+          if (verbose) {
+            cat(sprintf("Use optimized parameters from the cache for %s\n", var))
+          }
+
+          # set parameters in full_pipeline (without prefix)
+          pipeline_valid <- intersect(names(params), full_pipeline$param_set$ids())
+          full_pipeline$param_set$values <- modifyList(
+            full_pipeline$param_set$values,
+            params[pipeline_valid]
+          )
+
+          # set parameters in the GraphLearner (with prefix)
+          prefixed_names <- paste0(best_learner$id, ".", names(params))
+          learner_valid <- prefixed_names %in% learner$param_set$ids()
+
+          if (any(learner_valid)) {
+            prefixed_params <- setNames(params[learner_valid], prefixed_names[learner_valid])
+            learner$param_set$values <- modifyList(
+              learner$param_set$values,
+              prefixed_params
+            )
+          }
+
+          # warn about non-existent parameters
+          missing_in_pipeline <- setdiff(names(params), full_pipeline$param_set$ids())
+          missing_in_learner <- setdiff(names(params),
+                                        sub(paste0("^", best_learner$id, "\\."), "",
+                                            learner$param_set$ids()[startsWith(learner$param_set$ids(), best_learner$id)]))
+
+          if (length(missing_in_pipeline) > 0) {
+            warning("Missing in pipeline: ", paste(missing_in_pipeline, collapse = ", "))
+          }
+          if (length(missing_in_learner) > 0) {
+            warning("Missing in learner: ", paste(missing_in_learner, collapse = ", "))
+          }
+
+        }
+      }
+
+      # set predict type for classification problems
+      if (exists("target_col") && is.factor(data_temp[[target_col]])) {
+        learner$predict_type <- "prob"
+      }
+
+      learner$train(task)
+
+    }
+    ### Train Model End ###
+
+    ### *****Identify NAs Start***** ###################################################################################################
+    if (verbose) {
+      message("***** Identify missing values *****")
+    }
+
+    # fill remaining NAs with the column median (numeric) or mode (factor)
+    impute_missing_values <- function(data, ref_data) {
+      for (col in colnames(data)) {
+        if (any(is.na(data[[col]]))) {
+          if (is.numeric(ref_data[[col]])) {
+            data[[col]][is.na(data[[col]])] <- median(ref_data[[col]], na.rm = TRUE) # median
+          } else if (is.factor(ref_data[[col]])) {
+            mode_value <- names(which.max(table(ref_data[[col]], useNA = "no"))) # mode
+            data[[col]][is.na(data[[col]])] <- mode_value
+          }
+        }
+      }
+      return(data)
+    }
+
+    # indices of the missing values
+    missing_idx <- missing_indices[[var]]
+    if (length(missing_idx) == 0) {
+      next
+    }
+
+    if (!is_sc) {
+      # not semicontinuous
+
+      variables <- colnames(data_temp)
+      feature_cols <- setdiff(variables, var)
+
+      if (!isFALSE(selected_formula)) {
+        backend_data <- mm_data[missing_idx, ]
+        backend_data <- enforce_factor_levels(backend_data, factor_levels)
+
+        # backend_data <- set_new_levels_to_na(backend_data, factor_levels, data_y_fill_final, method_var)
+
+        # impute if there are NAs in backend_data
+        if (any(is.na(backend_data))) {
+          backend_data <- impute_missing_values(backend_data, data_temp)
+        }
+        check_all_factor_levels(backend_data, factor_levels)
+
+      } else {
+        # without formula
+        backend_cols <- union(feature_cols, var)
+        backend_data <- data_temp[missing_idx, backend_cols, with = FALSE]
+        backend_data <- enforce_factor_levels(backend_data, factor_levels)
+        check_all_factor_levels(backend_data, factor_levels)
+        # backend_data <- set_new_levels_to_na(backend_data, factor_levels, data_y_fill_final, method_var)
+
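+        # Fallback for learners without native NA support: remaining feature
+        # NAs are filled via impute_missing_values() (column median for
+        # numerics, mode for factors) so that prediction does not fail.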
+        if (!supports_missing) {
+          backend_data <- impute_missing_values(backend_data, data_y_fill)
+        }
+      }
+
+    } else {
+      # semicontinuous
+      zero_flag_col <- paste0(var, "_zero_flag")
+      variables <- colnames(data_temp)
+      feature_cols <- setdiff(variables, c(var, zero_flag_col))
+
+      if (verbose) {
+        message("feature_cols: ", paste(feature_cols, collapse = ", "))
+      }
+
+      if (!isFALSE(selected_formula)) {
+        class_pred_data <- mm_data[missing_idx, ]
+        class_pred_data <- enforce_factor_levels(class_pred_data, factor_levels)
+        # class_pred_data <- set_new_levels_to_na(class_pred_data, factor_levels, data_y_fill_final, method_var)
+        if (anyNA(class_pred_data)) {
+          class_pred_data <- impute_missing_values(class_pred_data, data_temp)
+        }
+
+      } else {
+        class_pred_data <- data_temp[missing_idx, c(feature_cols, zero_flag_col), with = FALSE]
+        class_pred_data <- enforce_factor_levels(class_pred_data, factor_levels)
+        # class_pred_data <- set_new_levels_to_na(class_pred_data, factor_levels, data_y_fill_final, method_var)
+        if (!supports_missing && anyNA(class_pred_data)) {
+          class_pred_data <- impute_missing_values(class_pred_data, data_temp)
+        }
+
+      }
+      reg_pred_data <- data_temp[data_temp[[var]] > 0, ]
+      reg_pred_data <- enforce_factor_levels(reg_pred_data, factor_levels)
+      # reg_pred_data <- set_new_levels_to_na(reg_pred_data, factor_levels, data_y_fill_final, method_var)
+      if (!supports_missing && anyNA(reg_pred_data)) {
+        reg_pred_data <- impute_missing_values(reg_pred_data, data_temp)
+      }
+
+    }
+    ### Identify NAs End ###
+
+    ### *****Select suitable task type Start***** ###################################################################################################
+
+    if (!is_sc) {
+      if (is.numeric(data_temp[[target_col]])) {
+        pred_task <- TaskRegr$new(
+          id = target_col,
+          backend = backend_data,
+          target = target_col
+        )
+      } else if (is.factor(data_temp[[target_col]])) {
+        pred_task <- TaskClassif$new(
+          id = target_col,
+          backend = backend_data,
+          target = target_col
+        )
+      } else {
+        stop("Error: Target variable is neither numeric nor a factor!")
+      }
+    }
+
+    ### Select suitable task type End ###
+
+    ### *****Predict Start***** ###################################################################################################
+    if (verbose) {
+      message("***** Predict")
+    }
+
+    # helper: inverse transformation for a transformed response
+    inverse_transform <- function(x, method) {
+      switch(method,
+             exp = log(x),
+             log = exp(x),
+             sqrt = x^2,
+             inverse = 1 / x,
+             stop("Unknown transformation: ", method)
+      )
+    }
+
+    # helper: number of decimal places of a value
+    get_decimal_places <- function(x) {
+      if (is.na(x)) return(0)
+      if (x == floor(x)) return(0)
+      nchar(sub(".*\\.", "", as.character(x)))
+    }
+
+    if (is_sc) {
+      zero_flag_col <- paste0(var, "_zero_flag")
+      variables <- colnames(data_temp)
+      feature_cols <- setdiff(variables, c(var, zero_flag_col))
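+      # Prediction mirrors the two-part training: the classifier first decides
+      # zero vs. positive for each missing row, then the regressor predicts
+      # the positive amounts.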
+
+      # 1) classification (zero vs. positive)
+      class_learner <- learner$classifier
+
+      # prediction data must not contain NAs
+      class_pred_data <- data_temp[missing_idx, feature_cols, with = FALSE]
+
+      # prediction without a Task (because the target variable is not available)
+      class_pred_data <- enforce_factor_levels(class_pred_data, factor_levels)
+
+      check_all_factor_levels(class_pred_data, factor_levels)
+      # class_pred_data <- set_new_levels_to_na(class_pred_data, factor_levels, data_y_fill_final, method_var)
+      if (anyNA(class_pred_data)) {
+        class_pred_data <- impute_missing_values(class_pred_data, data_temp)
+      }
+      pred_probs <- class_learner$predict_newdata(class_pred_data)$prob
+
+      # predict: stochastic draw in the final iteration, otherwise the mode
+      if (isFALSE(sequential) || i == nseq) {
+        preds_class <- apply(pred_probs, 1, function(probs) {
+          sample(colnames(pred_probs), size = 1, prob = probs)
+        })
+      } else {
+        preds_class <- colnames(pred_probs)[max.col(pred_probs)]
+      }
+
+      levels_zero_flag <- levels(data_temp[[zero_flag_col]])
+      data_temp[[zero_flag_col]][missing_idx] <- ifelse(preds_class == "positive",
+                                                        levels_zero_flag[levels_zero_flag == "positive"],
+                                                        levels_zero_flag[levels_zero_flag == "zero"])
+
+      # 2) regression: only for rows predicted as positive
+      reg_learner <- learner$regressor
+
+      reg_rows <- missing_idx[which(data_temp[[zero_flag_col]][missing_idx] == "positive")]
+      if (length(reg_rows) > 0) {
+        reg_pred_data <- data_temp[reg_rows, feature_cols, with = FALSE]
+
+        reg_pred_data <- enforce_factor_levels(reg_pred_data, factor_levels)
+        check_all_factor_levels(reg_pred_data, factor_levels)
+
+        # reg_pred_data <- set_new_levels_to_na(reg_pred_data, factor_levels, data_y_fill_final, method_var = method_var)
+        if (anyNA(reg_pred_data)) {
+          reg_pred_data <- impute_missing_values(reg_pred_data, data_temp[reg_rows])
+        }
+
+        preds_reg <- reg_learner$predict_newdata(reg_pred_data)$response
+      } else {
+        preds_reg <- numeric(0)
+      }
+
+      preds <- data_temp[[var]]
+      preds[missing_idx] <- 0
+      preds[reg_rows] <- preds_reg
+      preds <- preds[missing_idx]
+
+      if (verbose) {
+        message("preds: ", paste(preds, collapse = ", "))
+      }
+
+    } else {
+      # not semicontinuous
+      if (anyNA(backend_data[, ..feature_cols])) {
+        warning("NAs present in backend_data before Task creation - did fixfactors create new NAs?")
+        print(which(sapply(backend_data[, ..feature_cols], function(col) anyNA(col))))
+      }
+
+      bdt <- as.data.table(backend_data)
+      bdt <- enforce_factor_levels(bdt, factor_levels)
+      check_all_factor_levels(bdt, factor_levels)
+
+      if (anyNA(bdt)) {
+        bdt <- impute_missing_values(bdt, data_temp)
+        if (verbose) {
+          message("Imputed remaining NAs in the prediction backend")
+        }
+      }
+      # bdt <- enforce_factor_levels(bdt, factor_levels)
+      check_factor_levels(bdt, factor_levels)
+      backend_data <- mlr3::as_data_backend(bdt)
+
+      if (is.factor(data_temp[[target_col]])) {
+        pred_task <- TaskClassif$new(
+          id = target_col,
+          backend = backend_data,
+          target = target_col
+        )
+      } else if (is.numeric(data_temp[[target_col]])) {
+        pred_task <- TaskRegr$new(
+          id = target_col,
+          backend = backend_data,
+          target = target_col
+        )
+      } else {
+        stop("Target variable must be factor or numeric!")
+      }
+
+      if (is.factor(data_temp[[target_col]])) {
+
+        mod <- switch(method_var,
+                      ranger = "classif.ranger",
+                      xgboost = "classif.xgboost",
+                      regularized = "classif.glmnet",
+                      robust = "classif.glm_rob",
+                      stop("Unknown method for classification: ", method_var))
+
+        learner$model[[mod]]$param_set$values$predict_type <- "prob"
+
+        pred_probs <- learner$predict(pred_task)$prob
+        if (is.null(pred_probs)) {
+          stop("Error while calculating probabilities.")
+        }
+
+        # stochastic class assignment in the final iteration, otherwise the mode
+        if (isFALSE(sequential) || i == nseq) {
+          preds <- apply(pred_probs, 1, function(probs) {
+            sample(levels(data_temp[[target_col]]), size = 1, prob = probs)
+          })
+        } else {
+          preds <- apply(pred_probs, 1, which.max)
+          preds <- levels(data_temp[[target_col]])[preds]
+        }
+
+      } else {
+        preds <- learner$predict(pred_task)$response
+      }
+    }
+
+    # back-transform predictions if the response was transformed
+    if (!is.null(lhs_transformation)) {
+      preds <- inverse_transform(preds, lhs_transformation)
+    }
+
+    # round numeric predictions to the precision of the observed values
+    if (inherits(preds, "numeric")) {
+      decimal_places <- max(sapply(na.omit(data[[var]]), get_decimal_places), na.rm = TRUE)
+      preds <- round(preds, decimal_places)
+    }
+    ### Predict End ###
+
+    ### *****PMM Start***** ###################################################################################################
+    if (verbose) {
+      message("***** pmm for predictions")
+    }
+    if (pmm[[var]] && is.numeric(data_temp[[var]])) {
+      if (verbose) {
+        message(paste0("PMM is applied to ", var, "."))
+      }
+
+      # true observed values from the original data
+      observed_values <- na.omit(data[[var]])
+
+      # replace each prediction by the closest observed value
+      preds <- sapply(preds, function(x) {
+        observed_values[which.min(abs(observed_values - x))]
+      })
+    }
+    ### PMM End ###
+
+    ### *****Prediction History Start***** ###################################################################################################
+    if (verbose) {
+      message("***** Predict History")
+    }
+    if (pred_history == TRUE) {
+      history[[paste0(var, "_iter", i)]] <- data.table(
+        iteration = i,
+        variable = var,
+        index = missing_idx,
+        predicted_values = preds
+      )
+    }
+    ### Prediction History End ###
+
+    ### *****Replace missing values with predicted values Start***** ###################################################################################################
+    if (verbose) {
+      message("***** Replace values with new predictions")
+    }
+
+    if (length(missing_idx) > 0) {
+      if (is.numeric(data_prev[[var]])) {
+        data[missing_idx, (var) := as.numeric(preds)]
+      } else if (is.factor(data[[var]])) {
+        data[missing_idx, (var) := factor(preds, levels = factor_levels[[var]])]
+      } else {
+        stop(paste("Unknown data type for variable:", var))
+      }
+      data <- copy(data)
+    }
+    ### Replace missing values with predicted values End ###
+
+    ### *****Import Variable Start***** ###################################################################################################
+    if (verbose) {
+      message("***** Import Variable (imp_var = TRUE)")
+    }
+    if (length(missing_idx) > 0) {
+      if (imp_var) {
+        imp_col <- paste0(var, "_imp") # name of the _imp column
+
+        # Check whether the variable contains missing values using `missing_idx`.
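+        # The `_imp` indicator column is created on demand and set to TRUE
+        # only for the rows that were actually imputed in this iteration.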
+        if (!is.null(missing_idx) && length(missing_idx) > 0) {
+          # if the `_imp` column does not exist, create it as a boolean variable
+          if (!(imp_col %in% colnames(data_new))) {
+            data_new[, (imp_col) := FALSE]
+          }
+
+          data_new[, (var) := data[[var]]]
+
+          # ensure that `preds` is not NULL or empty
+          if (length(preds) == length(missing_idx) && !all(is.na(preds))) {
+            # mark the imputed rows as TRUE
+            set(data_new, i = missing_idx, j = imp_col, value = TRUE)
+          } else {
+            warning(paste0("`preds` is empty or does not have the same length as `missing_idx` for ", var))
+          }
+        }
+      }
+    }
+    var_end_time <- Sys.time()
+    var_time <- difftime(var_end_time, var_start_time, units = "secs")
+    iteration_times[[var]] <- round(as.numeric(var_time), 2)
+    if (verbose) {
+      message(paste("time used for", var, ":", iteration_times[[var]], "seconds"))
+    }
+  }
+  ### Import Variable END ###
+
+  ### *****Stop Criteria Start***** ###################################################################################################
+  if (sequential && i != 1) {
+    is_dt <- inherits(data, "data.table")
+
+    # automatic detection of the variable types
+    numeric_cols <- names(data)[sapply(data, is.numeric)]
+    factor_cols <- names(data)[sapply(data, is.factor)]
+
+    # calculation of the numerical differences
+    if (length(numeric_cols) > 0) {
+      epsilon <- 1e-8 # avoid division by zero
+
+      if (is_dt) {
+        # standard deviation of each numeric column in data_prev
+        std_dev <- sapply(data_prev[, mget(numeric_cols)], sd, na.rm = TRUE)
+
+        # normalized relative change
+        num_diff <- sum(abs(data[, mget(numeric_cols)] - data_prev[, mget(numeric_cols)]) /
+                          (std_dev + epsilon), na.rm = TRUE)
+      } else {
+        std_dev <- sapply(data_prev[, numeric_cols, drop = FALSE], sd, na.rm = TRUE)
+        num_diff <- sum(abs(data[, numeric_cols, drop = FALSE] - data_prev[, numeric_cols, drop = FALSE]) /
+                          (std_dev + epsilon), na.rm = TRUE)
+      }
+    } else {
+      num_diff <- 0
+    }
+
+    if (length(factor_cols) > 0) {
+      if (is_dt) {
+        # number of changed values
+        cat_changes <- sum(data[, factor_cols, with = FALSE] != data_prev[, factor_cols, with = FALSE], na.rm = TRUE)
+
+        # total number of categorical values
+        total_cat_values <- sum(!is.na(data_prev[, factor_cols, with = FALSE]))
+      } else {
+        # if 'data' is a data.frame
+        cat_changes <- sum(data[factor_cols] != data_prev[factor_cols], na.rm = TRUE)
+        total_cat_values <- sum(!is.na(data_prev[factor_cols]))
+      }
+
+      # normalized rate of change
+      cat_diff <- cat_changes / (total_cat_values + 1e-8) # + epsilon to avoid division by zero
+    } else {
+      cat_diff <- 0 # if there are no categorical columns, the difference is 0
+    }
+
+    # total change
+    total_diff <- num_diff + cat_diff
+
+    # check convergence
+    if (total_diff < eps) {
+      no_change_counter <- no_change_counter + 1
+      if (no_change_counter >= 2) {
+        if (verbose) {
+          message("Convergence achieved after two consecutive iterations without changes")
+          message(paste("stop after", i, "iterations"))
+        }
+
+        if (is.factor(data_temp[[target_col]])) {
+
+          mod <- switch(method_var,
+                        ranger = "classif.ranger",
+                        xgboost = "classif.xgboost",
+                        regularized = "classif.glmnet",
+                        robust = "classif.glm_rob")
+
+          learner$model[[mod]]$param_set$values$predict_type <- "prob"
+          pred_probs <- learner$predict(pred_task)$prob
+
+          if (is.null(pred_probs)) {
stop("Error in the calculation of prediction probabilities.") + } + + preds <- apply(pred_probs, 1, function(probs) { + sample(levels(data_temp[[target_col]]), size = 1, prob = probs) # at last iteration: stochastic class assignment + }) + } + + break + } + } else { + no_change_counter <- 0 + } + } else { + no_change_counter <- 0 + } + } + ### Stop criteria END ### + + result <- as.data.table(if (imp_var) data_new else data) # Default: Return `data` only + result <- enforce_factor_levels(result, factor_levels) + + if (!pred_history && !tune) { + return(result) + } + + output <- list(data = result) + + if (pred_history) { + pred_result <- rbindlist(history, fill = TRUE) + output$pred_history <- pred_result + } + if (tune) { + output$tuning_log <- tuning_log + + } + return(output) +} diff --git a/inst/doc/supp.pdf b/inst/doc/supp.pdf deleted file mode 100644 index e4ec6d71..00000000 Binary files a/inst/doc/supp.pdf and /dev/null differ diff --git a/inst/tinytest/test_vimpute.R b/inst/tinytest/test_vimpute.R new file mode 100644 index 00000000..4f2b4008 --- /dev/null +++ b/inst/tinytest/test_vimpute.R @@ -0,0 +1,55 @@ +library(VIM) + +x <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE) +# xm <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, median = TRUE) +# +# y <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE) +# z <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE, +# imputation_uncertainty ="PMM_1") +# z <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, +# imputation_uncertainty ="PMM_5") +# +# +# a <- vimpute(sleep, method="xgboost", sequential = FALSE, imp_var=TRUE) +# b <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE) +# c <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE, +# imputation_uncertainty ="PMM_1") +# +# +# a <- vimpute(sleep, method="regression", sequential = FALSE, imp_var=TRUE) +# b <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE) +# c <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE, +# imputation_uncertainty ="PMM_3") +# +# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE) +# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE) +# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE, +# imputation_uncertainty ="PMM_3") +# +# +# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE) +# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE) +# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE, +# imputation_uncertainty ="PMM_3", model_uncertainty = "bootstrap", nboot = 50) +# +# df <- iris +# colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species") +# # randomly produce some missing values in the data +# set.seed(1) +# nbr_missing <- 50 +# y <- data.frame(row = sample(nrow(iris), size = nbr_missing, replace = TRUE), +# col = sample(ncol(iris), size = nbr_missing, replace = TRUE)) +# y<-y[!duplicated(y), ] +# df[as.matrix(y)] <- NA +# +# test8 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), +# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp") +# test9 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", +# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp") +# test10 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), 
variable = c("S.Length"), +# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") +# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", +# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") +# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", +# method="ranger", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") +# diff --git a/man/VIM-package.Rd b/man/VIM-package.Rd index f054a893..1fa69057 100644 --- a/man/VIM-package.Rd +++ b/man/VIM-package.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/VIM-package.R \docType{package} \name{VIM-package} -\alias{VIM-package} \alias{VIM} +\alias{VIM-package} \title{Visualization and Imputation of Missing Values} \description{ This package introduces new tools for the visualization of missing or @@ -36,10 +36,32 @@ and Classification}, Online first. DOI: 10.1007/s11634-011-0102-y. M. Templ, A. Kowarik, P. Filzmoser (2011) Iterative stepwise regression imputation using standard and robust methods. \emph{Journal of Computational Statistics and Data Analysis}, Vol. 55, pp. 2793-2806. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/statistikat/VIM} +} + } \author{ -Matthias Templ, Andreas Alfons, Alexander Kowarik, Bernd Prantner +\strong{Maintainer}: Matthias Templ \email{matthias.templ@gmail.com} + +Authors: +\itemize{ + \item Alexander Kowarik \email{alexander.kowarik@statistik.gv.at} (\href{https://orcid.org/0000-0001-8598-4130}{ORCID}) + \item Andreas Alfons + \item Johannes Gussenbauer + \item Nina Niederhametner + \item Eileen Vattheuer + \item Gregor de Cillia \email{gregor.decillia@statistik.gv.at} + \item Wolfgang Rannetbauer +} + +Other contributors: +\itemize{ + \item Bernd Prantner [contributor] +} -Maintainer: Matthias Templ \href{mailto:templ@tuwien.ac.at}{templ@tuwien.ac.at} } -\keyword{package} +\keyword{internal} diff --git a/man/hotdeck.Rd b/man/hotdeck.Rd index c73cc5cc..cd5d83cd 100644 --- a/man/hotdeck.Rd +++ b/man/hotdeck.Rd @@ -101,6 +101,7 @@ Other imputation methods: \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, \code{\link{sampleCat}()}, +\code{\link{vimpute}()}, \code{\link{xgboostImpute}()} } \author{ diff --git a/man/impPCA.Rd b/man/impPCA.Rd index d946c3e7..8eb0020a 100644 --- a/man/impPCA.Rd +++ b/man/impPCA.Rd @@ -92,6 +92,7 @@ Other imputation methods: \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, \code{\link{sampleCat}()}, +\code{\link{vimpute}()}, \code{\link{xgboostImpute}()} } \author{ diff --git a/man/irmi.Rd b/man/irmi.Rd index 2c7d3e0a..8d57a054 100644 --- a/man/irmi.Rd +++ b/man/irmi.Rd @@ -161,6 +161,7 @@ Other imputation methods: \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, \code{\link{sampleCat}()}, +\code{\link{vimpute}()}, \code{\link{xgboostImpute}()} } \author{ diff --git a/man/kNN.Rd b/man/kNN.Rd index 18b10bd9..83b2fac3 100644 --- a/man/kNN.Rd +++ b/man/kNN.Rd @@ -121,6 +121,7 @@ Other imputation methods: \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, \code{\link{sampleCat}()}, +\code{\link{vimpute}()}, \code{\link{xgboostImpute}()} } \author{ diff --git a/man/matchImpute.Rd b/man/matchImpute.Rd index 760e79ab..5dec9de1 100644 --- a/man/matchImpute.Rd +++ b/man/matchImpute.Rd @@ -60,6 +60,7 @@ Other imputation methods: \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, 
 \code{\link{sampleCat}()},
+\code{\link{vimpute}()},
 \code{\link{xgboostImpute}()}
 }
 \author{
diff --git a/man/medianSamp.Rd b/man/medianSamp.Rd
index b3598a6b..3e9f7d38 100644
--- a/man/medianSamp.Rd
+++ b/man/medianSamp.Rd
@@ -25,6 +25,7 @@ Other imputation methods:
 \code{\link{rangerImpute}()},
 \code{\link{regressionImp}()},
 \code{\link{sampleCat}()},
+\code{\link{vimpute}()},
 \code{\link{xgboostImpute}()}
 }
 \concept{imputation methods}
diff --git a/man/rangerImpute.Rd b/man/rangerImpute.Rd
index 3feafdc9..ea49a00b 100644
--- a/man/rangerImpute.Rd
+++ b/man/rangerImpute.Rd
@@ -53,6 +53,7 @@ Other imputation methods:
 \code{\link{medianSamp}()},
 \code{\link{regressionImp}()},
 \code{\link{sampleCat}()},
+\code{\link{vimpute}()},
 \code{\link{xgboostImpute}()}
 }
 \concept{imputation methods}
diff --git a/man/regressionImp.Rd b/man/regressionImp.Rd
index 85e949a7..a205840b 100644
--- a/man/regressionImp.Rd
+++ b/man/regressionImp.Rd
@@ -41,7 +41,7 @@ Impute missing values based on a regression model.
 }
 \details{
 \code{\link[=lm]{lm()}} is used for family "normal" and \code{\link[=glm]{glm()}} for all other families.
-(robust=TRUE: \code{\link[=lmrob]{lmrob()}}, \code{\link[=glmrob]{glmrob()}})
+(robust=TRUE: \code{\link[robustbase:lmrob]{robustbase::lmrob()}}, \code{\link[robustbase:glmrob]{robustbase::glmrob()}})
 }
 \examples{
@@ -69,6 +69,7 @@ Other imputation methods:
 \code{\link{medianSamp}()},
 \code{\link{rangerImpute}()},
 \code{\link{sampleCat}()},
+\code{\link{vimpute}()},
 \code{\link{xgboostImpute}()}
 }
 \author{
diff --git a/man/sampleCat.Rd b/man/sampleCat.Rd
index 9a6614d9..b9b071af 100644
--- a/man/sampleCat.Rd
+++ b/man/sampleCat.Rd
@@ -25,6 +25,7 @@ Other imputation methods:
 \code{\link{medianSamp}()},
 \code{\link{rangerImpute}()},
 \code{\link{regressionImp}()},
+\code{\link{vimpute}()},
 \code{\link{xgboostImpute}()}
 }
 \concept{imputation methods}
diff --git a/man/vimpute.Rd b/man/vimpute.Rd
new file mode 100644
index 00000000..7020f4a5
--- /dev/null
+++ b/man/vimpute.Rd
@@ -0,0 +1,101 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/vimpute.R
+\name{vimpute}
+\alias{vimpute}
+\title{Impute missing values with a preferred model, sequentially, with
+hyperparameter tuning and with PMM (if desired).
+Requires the 'helper_vimpute' script.}
+\usage{
+vimpute(
+  data,
+  considered_variables = names(data),
+  method = setNames(as.list(rep("ranger", length(considered_variables))),
+    considered_variables),
+  pmm = setNames(as.list(rep(TRUE, length(considered_variables))), considered_variables),
+  formula = FALSE,
+  sequential = TRUE,
+  nseq = 10,
+  eps = 0.005,
+  imp_var = TRUE,
+  pred_history = FALSE,
+  tune = FALSE,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{data}{\itemize{
+\item Dataset with missing values. Can be provided as a data.table or data.frame.
+}}
+
+\item{considered_variables}{\itemize{
+\item A character vector of variable names to be either imputed or used as predictors, excluding irrelevant columns from the imputation process.
+}}
+
+\item{method}{\itemize{
+\item A named list specifying the imputation method for each variable.
+}}
+
+\item{pmm}{\itemize{
+\item TRUE/FALSE indicating whether predictive mean matching is used. Provide as a list for each variable.
+}}
+
+\item{formula}{\itemize{
+\item If not all variables are used as predictors, or if transformations or interactions are required (applies to all X, for Y only transformations are possible). Only applicable for the methods "robust" and "regularized".
+Provide as a list for each variable that requires specific conditions.
+}}

+\item{sequential}{\itemize{
+\item If TRUE, all variables are imputed sequentially.
+}}
+
+\item{nseq}{\itemize{
+\item Maximum number of iterations (if sequential is TRUE).
+}}
+
+\item{eps}{\itemize{
+\item Threshold for convergence.
+}}
+
+\item{imp_var}{\itemize{
+\item If TRUE, indicator variables for the imputed values are stored.
+}}
+
+\item{pred_history}{\itemize{
+\item If TRUE, all predicted values across all iterations are stored.
+}}
+
+\item{tune}{\itemize{
+\item Tunes hyperparameters halfway through the iterations, TRUE or FALSE.
+}}
+
+\item{verbose}{\itemize{
+\item If TRUE, additional debugging output is provided.
+}}
+}
+\value{
+The imputed data set, or a list containing the imputed data set together with
+the prediction history and/or the tuning log.
+}
+\description{
+Impute missing values with a preferred model, sequentially, with
+hyperparameter tuning and with PMM (if desired).
+Requires the 'helper_vimpute' script.
+}
+\examples{
+\dontrun{
+x <- vimpute(data = sleep, sequential = FALSE)
+y <- vimpute(data = sleep, sequential = TRUE, nseq = 3)
+z <- vimpute(data = sleep, considered_variables =
+  c("Sleep", "Dream", "Span", "BodyWgt"), sequential = FALSE)
+}
+}
+\seealso{
+Other imputation methods:
+\code{\link{hotdeck}()},
+\code{\link{impPCA}()},
+\code{\link{irmi}()},
+\code{\link{kNN}()},
+\code{\link{matchImpute}()},
+\code{\link{medianSamp}()},
+\code{\link{rangerImpute}()},
+\code{\link{regressionImp}()},
+\code{\link{sampleCat}()},
+\code{\link{xgboostImpute}()}
+}
+\concept{imputation methods}
diff --git a/man/xgboostImpute.Rd b/man/xgboostImpute.Rd
index 9e0a9c96..a9105366 100644
--- a/man/xgboostImpute.Rd
+++ b/man/xgboostImpute.Rd
@@ -64,6 +64,7 @@ Other imputation methods:
 \code{\link{medianSamp}()},
 \code{\link{rangerImpute}()},
 \code{\link{regressionImp}()},
-\code{\link{sampleCat}()}
+\code{\link{sampleCat}()},
+\code{\link{vimpute}()}
 }
 \concept{imputation methods}
diff --git a/tests/test_imputeRobust.R b/tests/test_imputeRobust.R
deleted file mode 100644
index 3abfe874..00000000
--- a/tests/test_imputeRobust.R
+++ /dev/null
@@ -1,308 +0,0 @@
-# library(VIM)
-# library(laeken)
-# library(robustbase)
-# library(mice)
-# data("Animals_na")
-#
-# source("~/workspace/VIM/R/imputeRobust.R")
-#
-# # debugonce(imputeRobust)
-# imputeRobust(data = Animals_na,
-#              form = formula("lbrain ~ lbody"),
-#              method = "lm", uncert = "wresid", boot = FALSE)
-#
-# source("~/workspace/vimbook/R/ani_plot.R")
-#
-# data(Animals, package = "MASS")
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-pmm", mainlab = NULL,
-#          seed = 1)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-pmm-chain", mainlab = NULL,
-#          seed = 1)
-#
-#
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-normal", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-resid", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-wresid", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-pmm-nboot", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-normal-nboot", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-resid-nboot", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-wresid-nboot", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-pmm-noboot", mainlab = NULL, seed = 123)
-# ani_plot(Animals = Animals, method = "imputeRobust-lts-normal-noboot", mainlab = NULL,
seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-lts-resid-noboot", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-lts-wresid-noboot", mainlab = NULL, seed = 123) -# -# -# ani_plot(Animals = Animals, method = "imputeRobust-gam-pmm", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-gam-normal", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-gam-resid", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-gam-wresid", mainlab = NULL, seed = 123) -# -# ani_plot(Animals = Animals, method = "imputeRobust-lm-pmm", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-lm-normal", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-lm-resid", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-lm-wresid", mainlab = NULL, seed = 123) -# -# ani_plot(Animals = Animals, method = "imputeRobust-MM-pmm", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-MM-normal", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-MM-resid", mainlab = NULL, seed = 123) -# ani_plot(Animals = Animals, method = "imputeRobust-MM-wresid", mainlab = NULL, seed = 123) -# -# debugonce("imputeRobustChain") -# -# Animals_na$lbody[1:3] <- NA -# xi <- imputeRobustChain(formulas = list(formula("lbody ~ lbrain"), -# formula("lbrain ~ lbody")), -# data = Animals_na, -# boot = TRUE, -# robustboot = TRUE, -# method = "lts", -# takeAll = TRUE, -# eps = 0.5, -# maxit = 4, -# alpha = 0.5, -# uncert = "pmm", -# familiy = "Gaussian", -# value_back = "matrix", -# trace = TRUE) -# -# data(iris) -# iris[1:3, 1] <- iris[4:6, 2] <- iris[5:7, 3] <- iris[7:9, 4] <- NA -# iris[9:11, 5] <- NA -# debugonce("imputeRobustChain") -# xi <- imputeRobustChain(formulas = list(formula("Sepal.Length ~ ."), -# NULL, NULL, NULL, formula("Species ~ .")), -# data = iris, -# boot = TRUE, -# robustboot = TRUE, -# method = "lts", -# multinom.method = "multinom", -# takeAll = TRUE, -# eps = 0.5, -# maxit = 4, -# alpha = 0.5, -# uncert = "pmm", -# familiy = "Gaussian", -# value_back = "matrix", -# trace = TRUE) -# -# -# -# ## blasting -# -# library(stmo) -# data(blasting) -# head(blasting) -# -# lm_blasting1 <- lm(log(vibration) ~ log(distance) + log(load), -# data = blasting) -# summary(lm_blasting1) -# lm_blasting2 <- lm(log10(vibration) ~ log10(distance) * location + load, -# data = blasting) -# summary(lm_blasting2) -# -# blasting$vibration[sample(1:nrow(blasting), 7)] <- NA -# w <- complete.cases(blasting) -# -# forms <- list(NULL, NULL, NULL, formula("log10(vibration) ~ log10(distance) * location + load")) -# debugonce("imputeRobustChain") -# xi <- suppressWarnings(imputeRobustChain(formulas = forms, data = blasting, trace = TRUE, maxit = 1, uncert = "resid")) -# xi2 <- suppressWarnings(imputeRobustChain(data = blasting, trace = TRUE, maxit = 20)) -# -# plot(xi[, c("vibration", "distance")], col = w + 3) -# plot(xi2[, c("vibration", "distance")], col = w + 3) -# -# library(mice) -# x2 <- complete(mice(blasting, m = 1)) -# -# plot(x2[, c("vibration", "distance")], col = w + 3) -# -# -# f <- formula("log10(vibration) ~ log10(distance) * location + load") -# fn <- f[2] -# cn <- "vibration" -# -# data("cherry") -# cherry.lm <- lm(log(volume) ~ log(diameter) + log(height), data = cherry) -# -# data("leafburn", package = "faraway") -# pairs(leafburn) -# # log10(burntimei) = 
β0 + β1nitrogeni + β2log10(chlorine)i + β3potassiumi + -# -# data(wood, package = "robustbase") -# ?wood -# -# setMissOut <- function(x, vars = 1:ncol(x), r = rep(0.1, ncol(x))){ -# # outlier detection -# o <- robCompositions::outCoDa(x, coda = FALSE, quantile = 0.99) -# n_outliers <- sum(o$outlierIndex) -# n_nonoutliers <- sum(!o$outlierIndex) -# # sort data -# x_na <- x_orig <- rbind(x[!o$outlierIndex, ], x[o$outlierIndex, ]) -# # set missings in all variables -# # but not in outliers -# n <- nrow(x) -# n_good <- sum(!o$outlierIndex) -# n_out <- sum(o$outlierIndex) -# for(i in vars){ -# s <- sample(1:n_good, round(r[i] * n), replace = FALSE) -# x_na[s, i] <- NA -# } -# m1 <- is.na(x_na[, vars]) -# return(x_na) -# } -# -# wood_na <- setMissOut(wood) -# xi <- imputeRobustChain(data = wood_na) -# -# -# -# setMiss <- function(x, kind = "MCAR", rate = 0.3){ -# # simple function assuming x has variables Y and X -# SEQ <- 1:nrow(x) -# n <- nrow(x) -# p <- ncol(x) -# if(kind == "MCAR"){ -# s <- sample(SEQ, size = round(rate * n, 0)) -# } -# if(kind == "MAR"){ -# s <- sample(SEQ, size = round(rate * n, 0), prob = (x$X+abs(min(x$X))+1)/sum(x$X+abs(min(x$X))+1)) -# } -# if(kind == "MNAR"){ -# s <- sample(SEQ, size = round(rate * n, 0), prob = (x$Y+abs(min(x$Y))+1)/sum(x$Y+abs(min(x$Y))+1)) -# } -# x$Y[s] <- NA -# return(x) -# } -# - -# imputeRobust(x = Animals_na, -# formulas = list(formula("lbody ~ lbrain"), -# formula("lbrain ~ lbody"))) -# - -# head(Animals_na) -# aggr(Animals_na) -# First algorithm (fit robust and draw robust bootrap samples, -# then classical fit) - -# -# bootfunc.robust <- function (n) -# { -# random <- sample.int(n, replace = TRUE) -# as.numeric(table(factor(random, levels = seq_len(n)))) -# } -# -# -# imputeRobust <- function(x, -# formulas = NULL, -# eps = 0.1, -# maxit = 5, -# mi = 1){ -# -# # Missing index M -# M <- is.na(x) -# n <- nrow(x) -# p <- ncol(x) -# -# # colswithNA -# colswithNA <- apply(x, 2, function(x){any(is.na(x))}) -# -# # Initialize -# x <- kNN(x, imp_var = FALSE) -# -# -# -# for(i in 1:p){ -# if(colswithNA[i]){ -# if(!is.null(formulas[[i]])){ -# mf <- model.frame(formulas[[i]], data = x) -# y <- model.extract(mf, "response") -# y_name <- all.vars(formulas[[i]])[1] -# X_names <- all.vars(formulas[[i]])[-1] -# # robust fit -# mod <- ltsReg(formulas[[1]], data = x) -# # robust bootstrap sample -# bs <- sample(x = mod$best, size = n, replace = TRUE) -# # Fit a OLS regression -# mod2 <- lm(formulas[[i]], data = x[bs, ]) -# # Predict/Update missings including imputation uncertainty -# x[M[, i], y_name] <- predict(mod2, -# newdata = x[M[, i], X_names, drop = FALSE]) -# } -# -# } -# } -# return(x) -# } -# -# debugonce(imputeRobust) -# imputeRobust(x = Animals_na, -# formulas = list(formula("lbody ~ lbrain"), -# formula("lbrain ~ lbody"))) - -# -# -# # 1. calulate M -# # M <- is.na(data[, c(y_name, X_names)]) -# # calculate m -# m <- is.na(y) -# n <- nrow(Animals_na) -# -# # 2. initialize with knn -# x <- kNN(data[, c(y_name, X_names], imp_var = FALSE)]) -# -# # 3. LTS-regression -# mod <- ltsReg(lbrain ~ lbody, data = Animals_na) -# -# # 4. Draw a bootstrap sample of size n -# # from non-outliers -# bs <- sample(x = mod$best, size = n, replace = TRUE) -# -# # 5. Fit a OLS regression -# mod2 <- lm(lbrain ~ lbody, data = Animals[bs, ]) -# -# # 6. 
Predict/Update missings including imputation uncertainty
-# Animals[M, "lbrain"] <- predict(mod2,
-#                                 newdata = data.frame("lbody" = 1))
-#
-# }
-#
-# x <- Animals_na
-# formulas <- list(formula("lbrain ~ lbody", data = x),
-#                  formula("lbody ~ lbrain", data = x))
-#
-#
-# form <- formula(lbrain ~ lbody, data = Animals_na)
-# mf <- model.frame(form, data = Animals_na)
-# debugonce(imputeRobust)
-# imputeRobust(lbrain ~ lbody, data = Animals_na)
-#
-#
-# # 1. calulate M
-# M <- is.na(Animals_na$lbrain)
-# n <- nrow(Animals_na)
-#
-# # 2. initialize with knn
-# Animals <- kNN(Animals_na, imp_var = FALSE)
-#
-# # 3. LTS-regression
-# mod <- ltsReg(lbrain ~ lbody, data = Animals_na)
-#
-# # 4. Draw a bootstrap sample of size n
-# # from non-outliers
-# bs <- sample(x = mod$best, size = n, replace = TRUE)
-#
-# # 5. Fit a OLS regression
-# mod2 <- lm(lbrain ~ lbody, data = Animals[bs, ])
-#
-# # 6. Predict/Update missings including imputation uncertainty
-# Animals[M, "lbrain"] <- predict(mod2,
-#                                 newdata = data.frame("lbody" = 1))
-#
-# }
-#
-# # Second algorithm (draw calibrated bootstrap)
-#
-#
-# # Third algorithm (draw bootrap sample and fit robust)
\ No newline at end of file
diff --git a/vignettes/VIM.Rmd b/vignettes/VIM.Rmd
index 444aaf95..c417e4bb 100644
--- a/vignettes/VIM.Rmd
+++ b/vignettes/VIM.Rmd
@@ -70,6 +70,8 @@ available
 - `vignette("modelImp")` gives insight into the model-based imputation
   methods `regressionImp()` and `matchImpute()`
 - `vignette("irmi")` showcases the `irmi()` method.
+- `vignette("vimpute")` showcases the newly added `vimpute()` function that performs imputation with
+mlr3 as the backend.
 
 ## Visualize imputed values
diff --git a/vignettes/VisualImp.Rmd b/vignettes/VisualImp.Rmd
index d7670aec..14dd1819 100644
--- a/vignettes/VisualImp.Rmd
+++ b/vignettes/VisualImp.Rmd
@@ -357,4 +357,4 @@ The function `alphablend()`, can be used to convert colors to semitransparent co
 
 ```{r}
 alphablend("red", 0.6)
-```
+```
\ No newline at end of file
diff --git a/vignettes/vimpute.Rmd b/vignettes/vimpute.Rmd
new file mode 100644
index 00000000..f8507c53
--- /dev/null
+++ b/vignettes/vimpute.Rmd
@@ -0,0 +1,522 @@
+---
+title: "Imputation Method vimpute"
+author: "Eileen Vattheuer"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Imputation Method vimpute}
+  %\VignetteEncoding{UTF-8}
+  %\VignetteEngine{knitr::rmarkdown}
+---
+
+```{r, include=FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  fig.width = 7,
+  fig.height = 5,
+  fig.align = "center"
+)
+```
+
+## Introduction
+
+This vignette demonstrates how to use the `vimpute()` function for flexible missing data imputation using machine learning models from the mlr3 ecosystem.
+
+### Function Arguments
+
+- `data`: A `data.table` or `data.frame` containing missing values to be imputed.
+- `considered_variables`: A character vector of variable names to be either imputed or used as predictors, excluding irrelevant columns from the imputation process.
+- `method`: A named list specifying the imputation method for each variable.
+- `pmm`: TRUE/FALSE indicating whether predictive mean matching is used. Provide as a list for each variable.
+- `formula`: If not all variables are used as predictors, or if transformations or interactions are required
+  (applies to all X, for Y only transformations are possible). Only applicable for the methods "robust" and "regularized". Provide as a list for each variable that requires specific conditions.
+- `sequential`: Specifies whether the imputation should be performed sequentially.
+- `nseq`: The maximum number of sequential iterations, if `sequential` is TRUE.
+- `eps`: The convergence threshold for sequential imputation.
+- `imp_var`: Specifies whether to add indicator variables for imputed values.
+- `pred_history`: If enabled, saves the prediction history.
+- `tune`: Whether to perform hyperparameter tuning.
+
+## Data
+
+To demonstrate the function, the `sleep` dataset from the `VIM` package is used.
+
+```{r, echo = FALSE, results='hide', message=FALSE, warning=FALSE}
+library(VIM)
+library(data.table)
+```
+```{r setup_2, message = FALSE}
+data <- as.data.table(VIM::sleep)
+a <- aggr(sleep, plot = FALSE)
+plot(a, numbers = TRUE, prop = FALSE)
+```
+
+The left plot shows the number of missing values for each column of the sleep dataset, and the right plot shows how often each combination of missing values occurs. For example, there are 9 rows which contain a missing value in both NonD and Dream.
+
+```{r, message = FALSE}
+dataDS <- sleep[, c("Dream", "Sleep")]
+marginplot(dataDS, main = "Missing Values")
+```
+
+The __red__ boxplot on the left shows the distribution of all values of Sleep where Dream contains a missing value.
+The __blue__ boxplot on the left shows the distribution of the values of Sleep where Dream is observed.
+
+## Basic Usage
+
+### Default Imputation
+
+In the basic usage, the `vimpute()` function performs imputation using the default settings. It uses the "ranger" method for all variables, applies predictive mean matching, and performs sequential imputation with a convergence threshold of 0.005.
+
+```{r, include=TRUE, results='hide', message=FALSE, warning=FALSE}
+result <- vimpute(
+  data = data,
+  pred_history = TRUE)
+```
+```{r}
+print(head(result$data, 3))
+```
+
+Results and information about missing/imputed values can be shown in the plot margins:
+
+```{r}
+dataDS <- as.data.frame(result$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
+marginplot(dataDS, delimiter = "_imp", main = "Imputation with Default Model")
+```
+
+With `pred_history = TRUE`, the output is a list containing the imputed dataset and the prediction history.
+
+In this plot, three different colors are used in the top-right margin.
+These colors represent the structure of the missing values.
+
+* __brown__ points represent values where `Dream` was missing initially
+* __beige__ points represent values where `Sleep` was missing initially
+* __black__ points represent values where both `Dream` and `Sleep` were missing
+  initially
+
+## Advanced Options
+
+#### Parameter `method`
+*(default: "ranger" for all variables)*
+
+Specifies the machine learning method used for the imputation of each variable. In the example below, a different imputation method is specified for each variable: `NonD` uses a robust method, `Dream` and `Span` use ranger, `Sleep` uses xgboost, and `Gest` uses a regularized method.
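+
+To apply one and the same method to every variable instead, the named list can
+be built compactly with `setNames()`. A minimal sketch, assuming `data` is the
+dataset loaded above (the same pattern is used for the method comparison
+further below):
+
+```{r, eval = FALSE}
+# one method for all variables, as a named list keyed by column name
+method_all <- setNames(as.list(rep("xgboost", ncol(data))), names(data))
+result_all <- vimpute(data = data, method = method_all)
+```
+
+The four supported methods are: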
+
+- **`"robust"`**: Robust regression models
+  - `lmrob` for numeric variables: Implements MM-estimation for resistance to outliers
+  - `glmrob` for binary factors: Uses robust estimators to reduce outlier influence
+- **`"regularized"`**: Regularized regression (`glmnet`)
+  - Uses elastic net regularization
+  - Automatically handles multicollinearity
+- **`"ranger"`**: Random forest
+  - Fast implementation of random forests
+  - Handles non-linear relationships well
+- **`"xgboost"`**: Gradient boosted trees
+  - State-of-the-art tree boosting
+  - Handles mixed data types well
+
+```{r, include=TRUE, results='hide', message=FALSE, warning=FALSE}
+result_mixed <- vimpute(
+  data = data,
+  method = list(NonD = "robust", Dream = "ranger", Sleep = "xgboost", Span = "ranger", Gest = "regularized"),
+  pred_history = TRUE
+)
+```
+```{r}
+dataDS <- as.data.frame(result_mixed$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
+marginplot(dataDS, delimiter = "_imp", main = "Imputation with different Models for each Variable")
+```
+```{r, include=TRUE, results='hide', message=FALSE, warning=FALSE, echo=FALSE}
+result_xgboost <- vimpute(
+  data = data,
+  method = setNames(as.list(rep("xgboost", ncol(data))), names(data)),
+  pred_history = TRUE, verbose = FALSE
+)
+dataDS_xgboost <- as.data.frame(result_xgboost$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
+result_regularized <- vimpute(
+  data = data,
+  method = setNames(as.list(rep("regularized", ncol(data))), names(data)),
+  pred_history = TRUE
+)
+dataDS_regularized <- as.data.frame(result_regularized$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
+```
+
+The side-by-side margin plots compare the performance of two imputation methods, xgboost (left) and regularized (right):
+
+```{r, echo=FALSE, warning=FALSE}
+par(mfrow = c(1, 2))
+marginplot(dataDS_xgboost, delimiter = "_imp", main = "Imputation with xgboost")
+marginplot(dataDS_regularized, delimiter = "_imp", main = "Imputation with Regularized")
+par(mfrow = c(1, 1))
+```
+
+xgboost produces data-driven, uneven imputations that can capture complex patterns but may be less stable, while the regularized method produces smoother, more conservative estimates that are less prone to overfitting. The key trade-off is flexibility (xgboost) versus robustness (regularization).
+
+#### Parameter `pmm`
+*(default: TRUE for all numeric variables)*
+
+```{r, eval = FALSE}
+result <- vimpute(
+  data = data,
+  method = list(NonD = "robust",
+                Dream = "ranger",
+                Sleep = "xgboost",
+                Span = "ranger",
+                Gest = "regularized"),
+  pmm = list(NonD = FALSE, Dream = TRUE, Sleep = FALSE, Span = FALSE, Gest = TRUE)
+)
+```
+
+If TRUE, imputed values are restricted to values actually observed in the dataset (each prediction is replaced by the closest observed value), ensuring realistic imputations but potentially limiting variability.
+If FALSE, raw model predictions are used, allowing greater flexibility but risking implausible or extreme imputations.
+
+#### Parameter `formula`
+*(default: FALSE)*
+
+Specifies custom model formulas for the imputation of each variable, offering precise control over the imputation models.
+
+**Key Features:**
+
+1. **Variable-Specific Models**
+   - Each formula specifies which predictors should be used for imputing a particular variable
+   - Enables different predictor sets for different target variables
+   - Example:
+     ```r
+     formula = list(
+       income ~ education + age,
+       blood_pressure ~ weight + age
+     )
+     ```
+
+2. **Transformations Support**
+   - Handles common transformations on both sides of the formula:
+     - Response transformations: `log(y)`, `sqrt(y)`, `exp(y)`, `I(1/y)`
+     - Predictor transformations: `log(x1)`, `poly(x2, 2)`, etc.
+   - Example with transformations:
+     ```r
+     formula = list(
+       log(income) ~ poly(age, 2) + education,
+       sqrt(blood_pressure) ~ weight + I(1/age)
+     )
+     ```
+
+3. **Interaction Terms**
+   - Supports interaction terms using `:` or `*` syntax (on the right-hand side)
+   - Example:
+     ```r
+     formula = list(
+       price ~ sqft * neighborhood + year_built
+     )
+     ```
+
+**Example Demonstration:**
+
+```{r, eval = FALSE}
+result <- vimpute(
+  data = data,
+  method = setNames(as.list(rep("regularized", ncol(data))), names(data)),
+  formula = list(
+    NonD ~ Dream + Sleep,          # linear combination
+    Span ~ Dream:Sleep + Gest,     # with interaction term
+    log(Gest) ~ Sleep + exp(Span)  # with transformations
+  )
+)
+```
+
+**Interpreting the Example:**
+
+1. For `NonD`:
+   - Uses a linear combination of the `Dream` and `Sleep` variables
+   - Model: `NonD = β₀ + β₁*Dream + β₂*Sleep + ε`
+
+2. For `Span`:
+   - Includes the interaction between `Dream` and `Sleep`
+   - Plus a main effect of `Gest`
+   - Model: `Span = β₀ + β₁*Dream*Sleep + β₂*Gest + ε`
+
+3. For `Gest`:
+   - Uses a log-transformed response
+   - Predictors include `Sleep` and the exponential of `Span`
+   - Model: `log(Gest) = β₀ + β₁*Sleep + β₂*exp(Span) + ε`
+
+4. For `Sleep` and `Dream`, all other variables are used as predictors
+
+**Notes:**
+
+- Only works with the methods `"robust"` and `"regularized"`
+- All model.matrix-compatible functions work for predictors (for more information see ?model.matrix)
+- Response transformations (left-hand side) are automatically back-transformed
+
+#### Parameter `tune`
+*(default: FALSE)*
+
+```{r, eval = FALSE}
+result <- vimpute(
+  data = data,
+  tune = TRUE
+)
+```
+
+Whether to perform hyperparameter tuning (only possible if `sequential = TRUE`):
+
+- When TRUE:
+  - Conducts a randomized parameter search (after half of the iterations)
+  - Uses the best performing configuration
+
+- When FALSE:
+  - Uses the default model parameters
+  - Recommended: TRUE for optimal performance
+
+#### Parameters `nseq` and `eps`
+*(defaults: 10 and 0.005)*
+
+```{r, eval = FALSE}
+result <- vimpute(
+  data = data,
+  nseq = 20,
+  eps = 0.01
+)
+```
+
+`nseq` is the maximum number of sequential imputation iterations. Higher values:
+
+- Allow more refinement
+- Increase computation time
+
+`eps` is the convergence threshold for sequential imputation:
+
+- Stops early if the change between iterations is below eps
+- Smaller values require more precise convergence but may need more iterations
+
+#### Parameter `imp_var`
+*(default: TRUE)*
+
+```{r, eval = FALSE}
+result <- vimpute(
+  data = data,
+  imp_var = TRUE
+)
+```
+
+Creating indicator variables for imputed values adds "_imp" columns (TRUE/FALSE) to mark which data points were imputed. This is particularly useful for tracking imputation effects and conducting diagnostic analyses.
+
+#### Parameter `pred_history`
+*(default: FALSE)*
+
+```{r}
+print(tail(result$pred_history, 9))
+```
+
+When enabled (TRUE), this option saves the prediction trajectories in `$pred_history`, allowing users to track how imputed values evolve across iterations. This is particularly useful for diagnosing convergence issues.
+
+## Performance
+
+In order to validate the performance of `vimpute()`, the iris dataset is used. First, some values are randomly set to `NA`.
+```{r}
+library(reactable)
+
+data(iris)
+df <- as.data.table(iris)
+colnames(df) <- c("S.Length", "S.Width", "P.Length", "P.Width", "Species")
+# randomly produce some missing values in the data
+set.seed(1)
+nbr_missing <- 50
+y <- data.frame(row = sample(nrow(iris), size = nbr_missing, replace = TRUE),
+                col = sample(ncol(iris) - 1, size = nbr_missing, replace = TRUE))
+y <- y[!duplicated(y), ]
+df[as.matrix(y)] <- NA
+
+aggr(df)
+```
+```{r}
+sapply(df, function(x) sum(is.na(x)))
+```
+
+The data contains missing values across all variables, with some observations missing multiple values. The next step is to impute each variable; the following tables show the first five imputation results (rounded) for each variable.
+
+For the default model:
+
+```{r, results='hide', message=FALSE, warning=FALSE, include=FALSE}
+library(reactable)
+
+data(iris)
+df <- as.data.table(iris)
+colnames(df) <- c("S.Length", "S.Width", "P.Length", "P.Width", "Species")
+
+# Create complete copy before introducing NAs
+complete_data <- df
+
+# Randomly produce missing values
+set.seed(1)
+nbr_missing <- 50
+y <- data.frame(row = sample(nrow(df), size = nbr_missing, replace = TRUE),
+                col = sample(ncol(df), size = nbr_missing, replace = TRUE))
+y <- y[!duplicated(y), ]
+df[as.matrix(y)] <- NA
+
+# Perform imputation
+result <- vimpute(data = df, pred_history = TRUE)
+
+# Extract the imputed-indicator columns from result$data
+imputed_columns <- grep("_imp$", names(result$data), value = TRUE)
+
+# Create a function to compare true and imputed values
+compare_values <- function(true_data, pred_data, imputed_data, col_name) {
+  comparison <- data.frame(
+    True_Value = true_data[[col_name]],
+    Imputed_Value = ifelse(imputed_data, pred_data[[col_name]], NA)
+  )
+  comparison <- comparison[!is.na(comparison$Imputed_Value), ]
+  return(comparison)
+}
+
+# Initialize an empty list to store the comparison tables
+comparison_list <- list()
+
+# Loop through each imputed column and create a comparison table
+for (imputed_col in imputed_columns) {
+  col_name <- sub("_imp$", "", imputed_col)
+  comparison_list[[col_name]] <- compare_values(complete_data, result$data, result$data[[imputed_col]], col_name)
+}
+
+# Prepare the results in a combined wide format, ensuring equal row numbers
+results <- cbind(
+  "TRUE1" = head(comparison_list[["S.Length"]][, "True_Value"], 5),
+  "IMPUTED1" = head(comparison_list[["S.Length"]][, "Imputed_Value"], 5),
+  "TRUE2" = head(comparison_list[["S.Width"]][, "True_Value"], 5),
+  "IMPUTED2" = head(comparison_list[["S.Width"]][, "Imputed_Value"], 5),
+  "TRUE3" = head(comparison_list[["P.Length"]][, "True_Value"], 5),
+  "IMPUTED3" = head(comparison_list[["P.Length"]][, "Imputed_Value"], 5),
+  "TRUE4" = head(comparison_list[["P.Width"]][, "True_Value"], 5),
+  "IMPUTED4" = head(comparison_list[["P.Width"]][, "Imputed_Value"], 5)
+)
+
+# Print the combined wide-format table
+print(results)
+```
+```{r, echo=FALSE, warning=FALSE}
+# Load the reactable library
+library(reactable)
+
+# Create the reactable
+reactable(results, columns = list(
+  TRUE1 = colDef(name = "True"),
+  IMPUTED1 = colDef(name = "Imputed"),
+  TRUE2 = colDef(name = "True"),
+  IMPUTED2 = colDef(name = "Imputed"),
+  TRUE3 = colDef(name = "True"),
+  IMPUTED3 = colDef(name = "Imputed"),
+  TRUE4 = colDef(name = "True"),
+  IMPUTED4 = colDef(name = "Imputed")
+),
+columnGroups = list(
+  colGroup(name = "S.Length", columns = c("TRUE1", "IMPUTED1")),
+  colGroup(name = "S.Width", columns = c("TRUE2", "IMPUTED2")),
+  colGroup(name = "P.Length", columns = c("TRUE3", "IMPUTED3")),
+  colGroup(name = "P.Width", columns = c("TRUE4", "IMPUTED4"))
+),
+striped = TRUE,
+highlight = TRUE,
+bordered = TRUE
+)
+```
+
+For the xgboost model:
+
+```{r, results='hide', message=FALSE, warning=FALSE, include=FALSE}
+library(reactable)
+library(VIM)
+data(iris)
+
+# Create complete copy before introducing NAs
+complete_data <- iris
+colnames(complete_data) <- c("S.Length", "S.Width", "P.Length", "P.Width", "Species")
+df <- copy(complete_data)
+
+# Randomly produce missing values
+set.seed(1)
+nbr_missing <- 50
+y <- data.frame(row = sample(nrow(df), size = nbr_missing, replace = TRUE),
+                col = sample(ncol(df), size = nbr_missing, replace = TRUE))
+y <- y[!duplicated(y), ]
+df[as.matrix(y)] <- NA
+
+# Perform imputation with an explicit method specification
+result <- vimpute(
+  data = df,
+  method = setNames(lapply(names(df), function(x) "xgboost"), names(df)),
+  pred_history = TRUE
+)
+
+# Extract the imputed-indicator columns from result$data
+imputed_columns <- grep("_imp$", names(result$data), value = TRUE)
+
+# Create a function to compare true and imputed values
+compare_values <- function(true_data, pred_data, imputed_data, col_name) {
+  comparison <- data.frame(
+    True_Value = true_data[[col_name]],
+    Imputed_Value = ifelse(imputed_data, pred_data[[col_name]], NA)
+  )
+  comparison <- comparison[!is.na(comparison$Imputed_Value), ]
+  return(comparison)
+}
+
+# Initialize an empty list to store the comparison tables
+comparison_list <- list()
+
+# Loop through each imputed column and create a comparison table
+for (imputed_col in imputed_columns) {
+  col_name <- sub("_imp$", "", imputed_col)
+  comparison_list[[col_name]] <- compare_values(complete_data, result$data, result$data[[imputed_col]], col_name)
+}
+
+# Prepare the results in a combined wide format, ensuring equal row numbers
+results <- cbind(
+  "TRUE1" = head(comparison_list[["S.Length"]][, "True_Value"], 5),
+  "IMPUTED1" = head(comparison_list[["S.Length"]][, "Imputed_Value"], 5),
+  "TRUE2" = head(comparison_list[["S.Width"]][, "True_Value"], 5),
+  "IMPUTED2" = head(comparison_list[["S.Width"]][, "Imputed_Value"], 5),
+  "TRUE3" = head(comparison_list[["P.Length"]][, "True_Value"], 5),
+  "IMPUTED3" = head(comparison_list[["P.Length"]][, "Imputed_Value"], 5),
+  "TRUE4" = head(comparison_list[["P.Width"]][, "True_Value"], 5),
+  "IMPUTED4" = head(comparison_list[["P.Width"]][, "Imputed_Value"], 5)
+)
+
+# Print the combined wide-format table
+print(results)
+```
+```{r, echo=FALSE, warning=FALSE}
+# Load the reactable library
+library(reactable)
+
+# Create the reactable
+reactable(results, columns = list(
+  TRUE1 = colDef(name = "True"),
+  IMPUTED1 = colDef(name = "Imputed"),
+  TRUE2 = colDef(name = "True"),
+  IMPUTED2 = colDef(name = "Imputed"),
+  TRUE3 = colDef(name = "True"),
+  IMPUTED3 = colDef(name = "Imputed"),
+  TRUE4 = colDef(name = "True"),
+  IMPUTED4 = colDef(name = "Imputed")
+),
+columnGroups = list(
+  colGroup(name = "S.Length", columns = c("TRUE1", "IMPUTED1")),
+  colGroup(name = "S.Width", columns = c("TRUE2", "IMPUTED2")),
+  colGroup(name = "P.Length", columns = c("TRUE3", "IMPUTED3")),
+  colGroup(name = "P.Width", columns = c("TRUE4", "IMPUTED4"))
+),
+striped = TRUE,
+highlight = TRUE,
+bordered = TRUE
+)
+```
+
diff --git a/vignettes/xgboostImpute.Rmd b/vignettes/xgboostImpute.Rmd
index 56af29ae..26db5162 100644
--- a/vignettes/xgboostImpute.Rmd
+++ b/vignettes/xgboostImpute.Rmd
@@ -4,7 +4,7 @@ author: "Birgit Karlhuber"
date: "2024-07-08"
output: rmarkdown::html_vignette
vignette: >
-  %\VignetteIndexEntry{Imputation Method
based on Random Forest Model} + %\VignetteIndexEntry{Imputation Method based on xgboost} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---