diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index f9cb681..eaaf1e8 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -7,7 +7,9 @@ on: name: test-coverage.yaml -permissions: read-all +permissions: + contents: read + id-token: write jobs: test-coverage: @@ -40,12 +42,11 @@ jobs: - uses: codecov/codecov-action@v5 with: - # Fail if error if not on PR, or if on PR and token is given - fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} + fail_ci_if_error: true files: ./cobertura.xml plugins: noop disable_search: true - token: ${{ secrets.CODECOV_TOKEN }} + use_oidc: true - name: Show testthat output if: always() @@ -59,4 +60,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: coverage-test-failures - path: ${{ runner.temp }}/package \ No newline at end of file + path: ${{ runner.temp }}/package diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R index e731f95..7785be2 100644 --- a/R/xgboostImpute.R +++ b/R/xgboostImpute.R @@ -47,7 +47,6 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, # formula without left side for prediction formPred <- as.formula(paste( "~", rhs,"-1")) lhs_vector <- data[[lhsV]] - num_class <- NULL if (!any(is.na(lhs_vector))) { cat(paste0("No missings in ", lhsV, ".\n")) } else { @@ -59,24 +58,16 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, currentClass <- NULL if(inherits(labtmp,"factor")){ currentClass <- "factor" - - predict_levels <- levels(labtmp) - labtmp <- as.integer(labtmp)-1 - if(length(unique(labtmp))==2){ + if(length(levels(labtmp))==2){ objective <- "binary:logistic" - predict_levels <- predict_levels[unique(labtmp)+1] - labtmp <- as.integer(as.factor(labtmp))-1 - }else if(length(unique(labtmp))>2){ objective <- "multi:softprob" - num_class <- max(labtmp)+1 } }else if(inherits(labtmp,"integer")){ currentClass <- "integer" if(length(unique(labtmp))==2){ - lvlsInt <- unique(labtmp) - labtmp <- match(labtmp,lvlsInt)-1 + labtmp <- as.factor(labtmp) warning("binary factor detected but not properly stored as factor.") objective <- "binary:logistic" }else{ @@ -85,8 +76,7 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, }else if(inherits(labtmp,"numeric")){ currentClass <- "numeric" if(length(unique(labtmp))==2){ - lvlsInt <- unique(labtmp) - labtmp <- match(labtmp,lvlsInt)-1 + labtmp <- as.factor(labtmp) warning("binary factor detected but not properly stored as factor.") objective <- "binary:logistic" }else{ @@ -96,38 +86,30 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, mm <- model.matrix(form,dattmp) - if(!is.null(num_class)){ - mod <- xgboost::xgboost(data = mm, label = labtmp, - nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose, ...) - }else{ - mod <- xgboost::xgboost(data = mm, label = labtmp, - nrounds=nrounds, objective=objective, verbose = verbose, ...) - } + mod <- xgboost::xgboost(x = mm, y = labtmp, + nrounds=nrounds, objective=objective, + verbosity = ifelse(verbose,2,0), ...) if (verbose) message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations") predictions <- predict(mod, newdata = model.matrix(formPred,subset(data, !rhs_na & lhs_na)), reshape=TRUE) - - if(objective %in% c("binary:logistic","multi:softprob")){ - - if(objective =="binary:logistic"){ - predictions <- cbind(1-predictions,predictions) - } - + if(objective =="binary:logistic"){ + predictions <- levels(labtmp)[as.integer(runif(length(predictions))<=predictions)+1] + } + if(objective == "multi:softprob"){ predict_num <- 1:ncol(predictions) predictions <- apply(predictions,1,function(z,lev){ z <- cumsum(z) z_lev <- lev[z>runif(1)] return(z_lev[1]) - },lev=predict_num) - - if(is.factor(dattmp[[lhsV]])){ - predictions <- predict_levels[predictions] - }else{ - predictions <- lvlsInt[predictions] - } + },lev=levels(labtmp)) + } + if(currentClass=="integer"){ + predictions <- as.integer(as.character(predictions)) + }else if(currentClass=="numeric"){ + predictions <- as.numeric(as.character(predictions)) } data[!rhs_na & lhs_na, ][[lhsV]] <- predictions diff --git a/inst/tinytest/test_vimpute.R b/inst/tinytest/test_vimpute.R index 4f2b400..2be6d74 100644 --- a/inst/tinytest/test_vimpute.R +++ b/inst/tinytest/test_vimpute.R @@ -1,55 +1,149 @@ library(VIM) -x <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE) -# xm <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, median = TRUE) -# -# y <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE) -# z <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE, -# imputation_uncertainty ="PMM_1") -# z <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, -# imputation_uncertainty ="PMM_5") -# -# -# a <- vimpute(sleep, method="xgboost", sequential = FALSE, imp_var=TRUE) -# b <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE) -# c <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE, -# imputation_uncertainty ="PMM_1") -# -# -# a <- vimpute(sleep, method="regression", sequential = FALSE, imp_var=TRUE) -# b <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE) -# c <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE, -# imputation_uncertainty ="PMM_3") -# -# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE) -# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE) -# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE, -# imputation_uncertainty ="PMM_3") -# -# -# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE) -# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE) -# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE, -# imputation_uncertainty ="PMM_3", model_uncertainty = "bootstrap", nboot = 50) -# -# df <- iris -# colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species") -# # randomly produce some missing values in the data -# set.seed(1) -# nbr_missing <- 50 -# y <- data.frame(row = sample(nrow(iris), size = nbr_missing, replace = TRUE), -# col = sample(ncol(iris), size = nbr_missing, replace = TRUE)) -# y<-y[!duplicated(y), ] -# df[as.matrix(y)] <- NA -# -# test8 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), -# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp") -# test9 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", -# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp") -# test10 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), -# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") -# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", -# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") -# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3", -# method="ranger", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp") +# vimpute returns imputed data and *_imp indicators", { + set.seed(1) + out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = TRUE) + + expect_true(inherits(out, "data.frame")) + expect_identical(nrow(out), nrow(sleep)) + expect_true(all(colnames(sleep) %in% colnames(out))) + expect_true(any(grepl("_imp$", colnames(out)))) + expect_equal(sum(is.na(out[, colnames(sleep), with = FALSE])), 0) +# + +# vimpute without imp_var returns no *_imp columns", { + set.seed(1) + out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = FALSE) + + expect_identical(nrow(out), nrow(sleep)) + expect_false(any(grepl("_imp$", colnames(out)))) + expect_equal(sum(is.na(out)), 0) +# + +# vimpute returns prediction history when requested", { + set.seed(1) + out <- vimpute( + sleep, + method = "ranger", + sequential = FALSE, + nseq = 3, + pred_history = TRUE + ) + + expect_true(is.list(out)) + expect_true(all(c("data", "pred_history") %in% names(out))) + expect_true(inherits(out$data, "data.frame")) + expect_true(all(c("iteration", "variable", "index", "predicted_values") %in% names(out$pred_history))) + expect_true(nrow(out$pred_history) > 0) + expect_equal(max(out$pred_history$iteration), 1) +# + +# vimpute imputes mixed numeric and factor targets", { + set.seed(42) + d <- data.frame( + x = rnorm(40), + z = rnorm(40) + ) + d$grp <- factor(ifelse(d$x > 0, "A", "B")) + d$y <- d$x + d$z + d$y[sample(40, 6)] <- NA + d$grp[sample(40, 5)] <- NA + + out <- vimpute(d, method = "ranger", sequential = FALSE, imp_var = TRUE) + + expect_equal(sum(is.na(out$y)), 0) + expect_equal(sum(is.na(out$grp)), 0) + expect_true(is.factor(out$grp)) + expect_true(all(c("y_imp", "grp_imp") %in% names(out))) +# + +# vimpute validates method values", { + d <- sleep[1:20, c("Sleep", "Dream", "Span")] + expect_error(vimpute(d, method = list("invalid"), sequential = FALSE)) +# + +# vimpute validates pmm length/type", { + d <- sleep[1:20, c("Sleep", "Dream", "Span")] + expect_error(vimpute(d, pmm = list(TRUE), method = "ranger", sequential = FALSE)) +# + +# vimpute requires missing values in input", { + expect_error(vimpute(iris, method = "ranger", sequential = FALSE)) +# + +# vimpute validates formula type", { + d <- sleep[1:20, c("Sleep", "Dream", "Span")] + expect_error(vimpute(d, formula = ~Dream + Span, method = "ranger", sequential = FALSE)) +# +# + +# vimpute supports considered_variables subsets", { + vars <- c("Sleep", "Dream", "Span") + set.seed(1) + out <- vimpute( + sleep, + considered_variables = vars, + method = "ranger", + sequential = FALSE, + imp_var = TRUE + ) + + expect_true(all(vars %in% names(out))) + expect_false(any(c("BodyWgt", "BrainWgt") %in% names(out))) + expect_equal(sum(is.na(out[, vars, with = FALSE])), 0) +# + +# vimpute accepts pmm list named for NA variables only", { + pmm_na_vars <- setNames( + as.list(rep(FALSE, 5)), + c("NonD", "Dream", "Sleep", "Span", "Gest") + ) + set.seed(1) + out <- vimpute( + sleep, + method = "ranger", + pmm = pmm_na_vars, + sequential = FALSE, + imp_var = FALSE + ) + + expect_identical(nrow(out), nrow(sleep)) + expect_equal(sum(is.na(out)), 0) +# + +# vimpute runs multiple sequential iterations when configured", { + set.seed(1) + out <- vimpute( + sleep, + method = "ranger", + sequential = TRUE, + nseq = 2, + eps = -1, + pred_history = TRUE + ) + + expect_equal(max(out$pred_history$iteration), 2) + expect_equal(length(unique(out$pred_history$iteration)), 2) +# + +# vimpute rejects formulas for unsupported methods", { + d <- sleep[1:20, c("Sleep", "Dream", "Span", "BodyWgt")] + expect_error( + vimpute( + d, + method = "ranger", + pmm = FALSE, + formula = list(Sleep ~ Dream + Span + BodyWgt), + sequential = FALSE + ) + ) +# + +# vimpute warns on columns with more than 50% missing values", { + d <- sleep[1:20, c("Sleep", "Dream", "Span")] + d$Dream[1:15] <- NA + expect_warning( + vimpute(d, method = "ranger", sequential = FALSE, imp_var = FALSE), + "more than 50% missing values" + ) # diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R index 8aafba9..fa99221 100644 --- a/inst/tinytest/test_xgboostImpute.R +++ b/inst/tinytest/test_xgboostImpute.R @@ -4,7 +4,8 @@ x <- rnorm(100) df <- data.frame( y = x + rnorm(100, sd = .01), x = x, - fac = as.factor(x >= 0) + fac = as.factor(x >= 0), + facM = as.factor(abs(round(x))) ) max_dist <- function(x, y) { @@ -13,6 +14,7 @@ max_dist <- function(x, y) { df$y[1:3] <- NA df$fac[3:5] <- NA +df$facM[3:5] <- NA df$binNum <- as.integer(df$fac)+17 df$binInt <- as.integer(df$fac)+17L # xgboostImpute accuracy", { @@ -25,12 +27,17 @@ df$binInt <- as.integer(df$fac)+17L # xgboostImpute should do nothing for no missings", { df.out <- xgboostImpute(x ~ y, df) expect_identical(df.out$x, df$x) -# -# factor response predicted accurately", { + # two-level factor response predicted accurately", { + set.seed(1) df.out <- xgboostImpute(fac ~ x, df) expect_identical(df.out$fac, as.factor(df$x >= 0)) -# + + # three-level factor response predicted accurately", { + set.seed(1) + df.out <- xgboostImpute(facM ~ x, df) + expect_identical(df.out$facM, as.factor(abs(round(df$x)))) + # interger binary response predicted accurately", { expect_warning(df.out <- xgboostImpute(binInt ~ x, df)) @@ -45,5 +52,8 @@ df$binInt <- as.integer(df$fac)+17L df2$x[1:10] <- NA df.out <- xgboostImpute(x ~ fac, df2) expect_identical(as.factor(df.out$x >= 0), df$fac) + # with verbose enabled. + df.out <- xgboostImpute(x ~ fac, df2, verbose = TRUE) + expect_identical(as.factor(df.out$x >= 0), df$fac) # diff --git a/vignettes/xgboostImpute.Rmd b/vignettes/xgboostImpute.Rmd index 26db516..522aad0 100644 --- a/vignettes/xgboostImpute.Rmd +++ b/vignettes/xgboostImpute.Rmd @@ -5,8 +5,10 @@ date: "2024-07-08" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Imputation Method based on xgboost} - %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console --- @@ -87,10 +89,9 @@ df <- iris colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species") # randomly produce some missing values in the data set.seed(1) -nbr_missing <- 50 -y <- data.frame(row=sample(nrow(iris),size = nbr_missing,replace = T), - col=sample(ncol(iris)-1,size = nbr_missing,replace = T)) -y<-y[!duplicated(y),] +nbr_missing <- 48 +y <- data.frame(row=sample(nrow(iris),size = nbr_missing,replace = FALSE), + col=sample(rep(1:4,12))) df[as.matrix(y)]<-NA aggr(df)