statistikat · alexkowa · Feb 7, 2026 · Sep 22, 2025 · Dec 10, 2025 · Dec 18, 2025
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -7,7 +7,9 @@ on:
 
 name: test-coverage.yaml
 
-permissions: read-all
+permissions:
+  contents: read
+  id-token: write
 
 jobs:
   test-coverage:
@@ -40,12 +42,11 @@ jobs:
 
       - uses: codecov/codecov-action@v5
         with:
-          # Fail if error if not on PR, or if on PR and token is given
-          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
+          fail_ci_if_error: true
           files: ./cobertura.xml
           plugins: noop
           disable_search: true
-          token: ${{ secrets.CODECOV_TOKEN }}
+          use_oidc: true
 
       - name: Show testthat output
         if: always()
@@ -59,4 +60,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: coverage-test-failures
-          path: ${{ runner.temp }}/package
+          path: ${{ runner.temp }}/package
diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R
@@ -47,7 +47,6 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
     # formula without left side for prediction
     formPred <- as.formula(paste( "~", rhs,"-1"))
     lhs_vector <- data[[lhsV]]
-    num_class <- NULL
     if (!any(is.na(lhs_vector))) {
       cat(paste0("No missings in ", lhsV, ".\n"))
     } else {
@@ -59,24 +58,16 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
       currentClass <- NULL
       if(inherits(labtmp,"factor")){
         currentClass <- "factor"
-
-        predict_levels <- levels(labtmp)
-        labtmp <- as.integer(labtmp)-1
-        if(length(unique(labtmp))==2){
+        if(length(levels(labtmp))==2){
           objective <- "binary:logistic"
-          predict_levels <- predict_levels[unique(labtmp)+1]
-          labtmp <- as.integer(as.factor(labtmp))-1
-
         }else if(length(unique(labtmp))>2){
           objective <- "multi:softprob"
-          num_class <- max(labtmp)+1
         }
 
       }else if(inherits(labtmp,"integer")){
         currentClass <- "integer"
         if(length(unique(labtmp))==2){
-          lvlsInt <- unique(labtmp)
-          labtmp <- match(labtmp,lvlsInt)-1
+          labtmp <- as.factor(labtmp)
           warning("binary factor detected but not properly stored as factor.")
           objective <- "binary:logistic"
         }else{
@@ -85,8 +76,7 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
       }else if(inherits(labtmp,"numeric")){
         currentClass <- "numeric"
         if(length(unique(labtmp))==2){
-          lvlsInt <- unique(labtmp)
-          labtmp <- match(labtmp,lvlsInt)-1
+          labtmp <- as.factor(labtmp)
           warning("binary factor detected but not properly stored as factor.")
           objective <- "binary:logistic"
         }else{
@@ -96,38 +86,30 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
 
 
       mm <- model.matrix(form,dattmp)
-      if(!is.null(num_class)){
-        mod <- xgboost::xgboost(data = mm, label = labtmp,
-                                nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose, ...)
-      }else{
-        mod <- xgboost::xgboost(data = mm, label = labtmp,
-                                nrounds=nrounds, objective=objective, verbose = verbose, ...)
-      }
+      mod <- xgboost::xgboost(x = mm, y = labtmp,
+                              nrounds=nrounds, objective=objective,
+                              verbosity = ifelse(verbose,2,0), ...)
 
       if (verbose)
         message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
 
       predictions <- 
         predict(mod, newdata = model.matrix(formPred,subset(data, !rhs_na & lhs_na)), reshape=TRUE)
-
-      if(objective %in% c("binary:logistic","multi:softprob")){
-
-        if(objective =="binary:logistic"){
-          predictions <- cbind(1-predictions,predictions)
-        }
-
+      if(objective =="binary:logistic"){
+        predictions <- levels(labtmp)[as.integer(runif(length(predictions))<=predictions)+1]
+      }
+      if(objective == "multi:softprob"){
         predict_num <- 1:ncol(predictions)
         predictions <- apply(predictions,1,function(z,lev){
           z <- cumsum(z)
           z_lev <- lev[z>runif(1)]
           return(z_lev[1])
-        },lev=predict_num)
-
-        if(is.factor(dattmp[[lhsV]])){
-          predictions <- predict_levels[predictions]
-        }else{
-          predictions <- lvlsInt[predictions]
-        }
+        },lev=levels(labtmp))
+      }
+      if(currentClass=="integer"){
+        predictions <- as.integer(as.character(predictions))
+      }else if(currentClass=="numeric"){
+        predictions <- as.numeric(as.character(predictions))
       }
       data[!rhs_na & lhs_na, ][[lhsV]] <- predictions  
 

diff --git a/inst/tinytest/test_vimpute.R b/inst/tinytest/test_vimpute.R
@@ -1,55 +1,149 @@
 library(VIM)
 
-x <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE)
-# xm <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, median = TRUE)
-# 
-# y <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE)
-# z <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_1")
-# z <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_5")
-# 
-# 
-# a <- vimpute(sleep, method="xgboost", sequential = FALSE, imp_var=TRUE)
-# b <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE)
-# c <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_1")
-# 
-# 
-# a <- vimpute(sleep, method="regression", sequential = FALSE, imp_var=TRUE)
-# b <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE)
-# c <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_3")
-# 
-# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE)
-# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE)
-# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_3")
-# 
-# 
-# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE)
-# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE)
-# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE,
-#              imputation_uncertainty ="PMM_3", model_uncertainty = "bootstrap", nboot = 50)
-# 
-# df <- iris
-# colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species")
-# # randomly produce some missing values in the data
-# set.seed(1)
-# nbr_missing <- 50
-# y <- data.frame(row = sample(nrow(iris), size = nbr_missing, replace = TRUE),
-#                 col = sample(ncol(iris), size = nbr_missing, replace = TRUE))
-# y<-y[!duplicated(y), ]
-# df[as.matrix(y)] <- NA
-# 
-# test8 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"),
-#                  method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp")
-# test9 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", 
-#                  method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp")
-# test10 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"),
-#                   method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
-# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", 
-#                   method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
-# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", 
-#                   method="ranger", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
+# vimpute returns imputed data and *_imp indicators", {
+  set.seed(1)
+  out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = TRUE)
+
+  expect_true(inherits(out, "data.frame"))
+  expect_identical(nrow(out), nrow(sleep))
+  expect_true(all(colnames(sleep) %in% colnames(out)))
+  expect_true(any(grepl("_imp$", colnames(out))))
+  expect_equal(sum(is.na(out[, colnames(sleep), with = FALSE])), 0)
+# 
+
+# vimpute without imp_var returns no *_imp columns", {
+  set.seed(1)
+  out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = FALSE)
+
+  expect_identical(nrow(out), nrow(sleep))
+  expect_false(any(grepl("_imp$", colnames(out))))
+  expect_equal(sum(is.na(out)), 0)
+# 
+
+# vimpute returns prediction history when requested", {
+  set.seed(1)
+  out <- vimpute(
+    sleep,
+    method = "ranger",
+    sequential = FALSE,
+    nseq = 3,
+    pred_history = TRUE
+  )
+
+  expect_true(is.list(out))
+  expect_true(all(c("data", "pred_history") %in% names(out)))
+  expect_true(inherits(out$data, "data.frame"))
+  expect_true(all(c("iteration", "variable", "index", "predicted_values") %in% names(out$pred_history)))
+  expect_true(nrow(out$pred_history) > 0)
+  expect_equal(max(out$pred_history$iteration), 1)
+# 
+
+# vimpute imputes mixed numeric and factor targets", {
+  set.seed(42)
+  d <- data.frame(
+    x = rnorm(40),
+    z = rnorm(40)
+  )
+  d$grp <- factor(ifelse(d$x > 0, "A", "B"))
+  d$y <- d$x + d$z
+  d$y[sample(40, 6)] <- NA
+  d$grp[sample(40, 5)] <- NA
+
+  out <- vimpute(d, method = "ranger", sequential = FALSE, imp_var = TRUE)
+
+  expect_equal(sum(is.na(out$y)), 0)
+  expect_equal(sum(is.na(out$grp)), 0)
+  expect_true(is.factor(out$grp))
+  expect_true(all(c("y_imp", "grp_imp") %in% names(out)))
+# 
+
+# vimpute validates method values", {
+  d <- sleep[1:20, c("Sleep", "Dream", "Span")]
+  expect_error(vimpute(d, method = list("invalid"), sequential = FALSE))
+# 
+
+# vimpute validates pmm length/type", {
+  d <- sleep[1:20, c("Sleep", "Dream", "Span")]
+  expect_error(vimpute(d, pmm = list(TRUE), method = "ranger", sequential = FALSE))
+# 
+
+# vimpute requires missing values in input", {
+  expect_error(vimpute(iris, method = "ranger", sequential = FALSE))
+# 
+
+# vimpute validates formula type", {
+  d <- sleep[1:20, c("Sleep", "Dream", "Span")]
+  expect_error(vimpute(d, formula = ~Dream + Span, method = "ranger", sequential = FALSE))
+# 
+# 
+
+# vimpute supports considered_variables subsets", {
+  vars <- c("Sleep", "Dream", "Span")
+  set.seed(1)
+  out <- vimpute(
+    sleep,
+    considered_variables = vars,
+    method = "ranger",
+    sequential = FALSE,
+    imp_var = TRUE
+  )
+
+  expect_true(all(vars %in% names(out)))
+  expect_false(any(c("BodyWgt", "BrainWgt") %in% names(out)))
+  expect_equal(sum(is.na(out[, vars, with = FALSE])), 0)
+# 
+
+# vimpute accepts pmm list named for NA variables only", {
+  pmm_na_vars <- setNames(
+    as.list(rep(FALSE, 5)),
+    c("NonD", "Dream", "Sleep", "Span", "Gest")
+  )
+  set.seed(1)
+  out <- vimpute(
+    sleep,
+    method = "ranger",
+    pmm = pmm_na_vars,
+    sequential = FALSE,
+    imp_var = FALSE
+  )
+
+  expect_identical(nrow(out), nrow(sleep))
+  expect_equal(sum(is.na(out)), 0)
+# 
+
+# vimpute runs multiple sequential iterations when configured", {
+  set.seed(1)
+  out <- vimpute(
+    sleep,
+    method = "ranger",
+    sequential = TRUE,
+    nseq = 2,
+    eps = -1,
+    pred_history = TRUE
+  )
+
+  expect_equal(max(out$pred_history$iteration), 2)
+  expect_equal(length(unique(out$pred_history$iteration)), 2)
+# 
+
+# vimpute rejects formulas for unsupported methods", {
+  d <- sleep[1:20, c("Sleep", "Dream", "Span", "BodyWgt")]
+  expect_error(
+    vimpute(
+      d,
+      method = "ranger",
+      pmm = FALSE,
+      formula = list(Sleep ~ Dream + Span + BodyWgt),
+      sequential = FALSE
+    )
+  )
+# 
+
+# vimpute warns on columns with more than 50% missing values", {
+  d <- sleep[1:20, c("Sleep", "Dream", "Span")]
+  d$Dream[1:15] <- NA
+  expect_warning(
+    vimpute(d, method = "ranger", sequential = FALSE, imp_var = FALSE),
+    "more than 50% missing values"
+  )
 # 
diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R
@@ -4,7 +4,8 @@ x <- rnorm(100)
 df <- data.frame(
   y = x + rnorm(100, sd = .01),
   x = x,
-  fac = as.factor(x >= 0)
+  fac = as.factor(x >= 0),
+  facM = as.factor(abs(round(x)))
 )
 
 max_dist <- function(x, y) {
@@ -13,6 +14,7 @@ max_dist <- function(x, y) {
 
 df$y[1:3] <- NA
 df$fac[3:5] <- NA
+df$facM[3:5] <- NA
 df$binNum <- as.integer(df$fac)+17
 df$binInt <- as.integer(df$fac)+17L
 # xgboostImpute accuracy", {
@@ -25,12 +27,17 @@ df$binInt <- as.integer(df$fac)+17L
   # xgboostImpute should do nothing for no missings", {
   df.out <- xgboostImpute(x ~ y, df)
   expect_identical(df.out$x, df$x)
-# 
 
-# factor response predicted accurately", {
+  # two-level factor response predicted accurately", {
+  set.seed(1)
   df.out <- xgboostImpute(fac ~ x, df)
   expect_identical(df.out$fac, as.factor(df$x >= 0))
-# 
+
+  # three-level factor response predicted accurately", {
+  set.seed(1)
+  df.out <- xgboostImpute(facM ~ x, df)
+  expect_identical(df.out$facM, as.factor(abs(round(df$x))))
+
 
   # interger binary response predicted accurately", {
   expect_warning(df.out <- xgboostImpute(binInt ~ x, df))
@@ -45,5 +52,8 @@ df$binInt <- as.integer(df$fac)+17L
   df2$x[1:10] <- NA
   df.out <- xgboostImpute(x ~ fac, df2)
   expect_identical(as.factor(df.out$x >= 0), df$fac)
+  # with verbose enabled.
+  df.out <- xgboostImpute(x ~ fac, df2, verbose = TRUE)
+  expect_identical(as.factor(df.out$x >= 0), df$fac)
 #