Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ on:

name: test-coverage.yaml

permissions: read-all
permissions:
contents: read
id-token: write

jobs:
test-coverage:
Expand Down Expand Up @@ -40,12 +42,11 @@ jobs:

- uses: codecov/codecov-action@v5
with:
# Fail if error if not on PR, or if on PR and token is given
fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
files: ./cobertura.xml
plugins: noop
disable_search: true
token: ${{ secrets.CODECOV_TOKEN }}
use_oidc: true

- name: Show testthat output
if: always()
Expand All @@ -59,4 +60,4 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: coverage-test-failures
path: ${{ runner.temp }}/package
path: ${{ runner.temp }}/package
50 changes: 16 additions & 34 deletions R/xgboostImpute.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
# formula without left side for prediction
formPred <- as.formula(paste( "~", rhs,"-1"))
lhs_vector <- data[[lhsV]]
num_class <- NULL
if (!any(is.na(lhs_vector))) {
cat(paste0("No missings in ", lhsV, ".\n"))
} else {
Expand All @@ -59,24 +58,16 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
currentClass <- NULL
if(inherits(labtmp,"factor")){
currentClass <- "factor"

predict_levels <- levels(labtmp)
labtmp <- as.integer(labtmp)-1
if(length(unique(labtmp))==2){
if(length(levels(labtmp))==2){
objective <- "binary:logistic"
predict_levels <- predict_levels[unique(labtmp)+1]
labtmp <- as.integer(as.factor(labtmp))-1

}else if(length(unique(labtmp))>2){
objective <- "multi:softprob"
num_class <- max(labtmp)+1
}

}else if(inherits(labtmp,"integer")){
currentClass <- "integer"
if(length(unique(labtmp))==2){
lvlsInt <- unique(labtmp)
labtmp <- match(labtmp,lvlsInt)-1
labtmp <- as.factor(labtmp)
warning("binary factor detected but not properly stored as factor.")
objective <- "binary:logistic"
}else{
Expand All @@ -85,8 +76,7 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
}else if(inherits(labtmp,"numeric")){
currentClass <- "numeric"
if(length(unique(labtmp))==2){
lvlsInt <- unique(labtmp)
labtmp <- match(labtmp,lvlsInt)-1
labtmp <- as.factor(labtmp)
warning("binary factor detected but not properly stored as factor.")
objective <- "binary:logistic"
}else{
Expand All @@ -96,38 +86,30 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,


mm <- model.matrix(form,dattmp)
if(!is.null(num_class)){
mod <- xgboost::xgboost(data = mm, label = labtmp,
nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose, ...)
}else{
mod <- xgboost::xgboost(data = mm, label = labtmp,
nrounds=nrounds, objective=objective, verbose = verbose, ...)
}
mod <- xgboost::xgboost(x = mm, y = labtmp,
nrounds=nrounds, objective=objective,
verbosity = ifelse(verbose,2,0), ...)

if (verbose)
message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")

predictions <-
predict(mod, newdata = model.matrix(formPred,subset(data, !rhs_na & lhs_na)), reshape=TRUE)

if(objective %in% c("binary:logistic","multi:softprob")){

if(objective =="binary:logistic"){
predictions <- cbind(1-predictions,predictions)
}

if(objective =="binary:logistic"){
predictions <- levels(labtmp)[as.integer(runif(length(predictions))<=predictions)+1]
}
if(objective == "multi:softprob"){
predict_num <- 1:ncol(predictions)
predictions <- apply(predictions,1,function(z,lev){
z <- cumsum(z)
z_lev <- lev[z>runif(1)]
return(z_lev[1])
},lev=predict_num)

if(is.factor(dattmp[[lhsV]])){
predictions <- predict_levels[predictions]
}else{
predictions <- lvlsInt[predictions]
}
},lev=levels(labtmp))
}
if(currentClass=="integer"){
predictions <- as.integer(as.character(predictions))
}else if(currentClass=="numeric"){
predictions <- as.numeric(as.character(predictions))
}
data[!rhs_na & lhs_na, ][[lhsV]] <- predictions

Expand Down
198 changes: 146 additions & 52 deletions inst/tinytest/test_vimpute.R
Original file line number Diff line number Diff line change
@@ -1,55 +1,149 @@
library(VIM)

x <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE)
# xm <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE, median = TRUE)
#
# y <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE)
# z <- vimpute(sleep, method="ranger", sequential = TRUE, imp_var=TRUE,
# imputation_uncertainty ="PMM_1")
# z <- vimpute(sleep, method="ranger", sequential = FALSE, imp_var=TRUE,
# imputation_uncertainty ="PMM_5")
#
#
# a <- vimpute(sleep, method="xgboost", sequential = FALSE, imp_var=TRUE)
# b <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE)
# c <- vimpute(sleep, method="xgboost", sequential = TRUE, imp_var=TRUE,
# imputation_uncertainty ="PMM_1")
#
#
# a <- vimpute(sleep, method="regression", sequential = FALSE, imp_var=TRUE)
# b <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE)
# c <- vimpute(sleep, method="regression", sequential = TRUE, imp_var=TRUE,
# imputation_uncertainty ="PMM_3")
#
# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE)
# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE)
# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE,
# imputation_uncertainty ="PMM_3")
#
#
# a <- vimpute(sleep, method="robust", sequential = FALSE, imp_var=TRUE)
# b <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE)
# c <- vimpute(sleep, method="robust", sequential = TRUE, imp_var=TRUE,
# imputation_uncertainty ="PMM_3", model_uncertainty = "bootstrap", nboot = 50)
#
# df <- iris
# colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species")
# # randomly produce some missing values in the data
# set.seed(1)
# nbr_missing <- 50
# y <- data.frame(row = sample(nrow(iris), size = nbr_missing, replace = TRUE),
# col = sample(ncol(iris), size = nbr_missing, replace = TRUE))
# y<-y[!duplicated(y), ]
# df[as.matrix(y)] <- NA
#
# test8 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"),
# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp")
# test9 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3",
# method="regression", imp_var=FALSE, sequential = FALSE, imp_suffix="imp")
# test10 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"),
# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3",
# method="regression", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
# test11 <- vimpute(data=df, formula = list(S.Length=~S.Width+P.Length), variable = c("S.Length"), imputation_uncertainty = "PMM_3",
# method="ranger", imp_var=FALSE, nseq=10, sequential = TRUE, imp_suffix="imp")
# vimpute returns imputed data and *_imp indicators", {
set.seed(1)
out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = TRUE)

expect_true(inherits(out, "data.frame"))
expect_identical(nrow(out), nrow(sleep))
expect_true(all(colnames(sleep) %in% colnames(out)))
expect_true(any(grepl("_imp$", colnames(out))))
expect_equal(sum(is.na(out[, colnames(sleep), with = FALSE])), 0)
#

# vimpute without imp_var returns no *_imp columns", {
set.seed(1)
out <- vimpute(sleep, method = "ranger", sequential = FALSE, imp_var = FALSE)

expect_identical(nrow(out), nrow(sleep))
expect_false(any(grepl("_imp$", colnames(out))))
expect_equal(sum(is.na(out)), 0)
#

# vimpute returns prediction history when requested", {
set.seed(1)
out <- vimpute(
sleep,
method = "ranger",
sequential = FALSE,
nseq = 3,
pred_history = TRUE
)

expect_true(is.list(out))
expect_true(all(c("data", "pred_history") %in% names(out)))
expect_true(inherits(out$data, "data.frame"))
expect_true(all(c("iteration", "variable", "index", "predicted_values") %in% names(out$pred_history)))
expect_true(nrow(out$pred_history) > 0)
expect_equal(max(out$pred_history$iteration), 1)
#

# vimpute imputes mixed numeric and factor targets", {
set.seed(42)
d <- data.frame(
x = rnorm(40),
z = rnorm(40)
)
d$grp <- factor(ifelse(d$x > 0, "A", "B"))
d$y <- d$x + d$z
d$y[sample(40, 6)] <- NA
d$grp[sample(40, 5)] <- NA

out <- vimpute(d, method = "ranger", sequential = FALSE, imp_var = TRUE)

expect_equal(sum(is.na(out$y)), 0)
expect_equal(sum(is.na(out$grp)), 0)
expect_true(is.factor(out$grp))
expect_true(all(c("y_imp", "grp_imp") %in% names(out)))
#

# vimpute validates method values", {
d <- sleep[1:20, c("Sleep", "Dream", "Span")]
expect_error(vimpute(d, method = list("invalid"), sequential = FALSE))
#

# vimpute validates pmm length/type", {
d <- sleep[1:20, c("Sleep", "Dream", "Span")]
expect_error(vimpute(d, pmm = list(TRUE), method = "ranger", sequential = FALSE))
#

# vimpute requires missing values in input", {
expect_error(vimpute(iris, method = "ranger", sequential = FALSE))
#

# vimpute validates formula type", {
d <- sleep[1:20, c("Sleep", "Dream", "Span")]
expect_error(vimpute(d, formula = ~Dream + Span, method = "ranger", sequential = FALSE))
#
#

# vimpute supports considered_variables subsets", {
vars <- c("Sleep", "Dream", "Span")
set.seed(1)
out <- vimpute(
sleep,
considered_variables = vars,
method = "ranger",
sequential = FALSE,
imp_var = TRUE
)

expect_true(all(vars %in% names(out)))
expect_false(any(c("BodyWgt", "BrainWgt") %in% names(out)))
expect_equal(sum(is.na(out[, vars, with = FALSE])), 0)
#

# vimpute accepts pmm list named for NA variables only", {
pmm_na_vars <- setNames(
as.list(rep(FALSE, 5)),
c("NonD", "Dream", "Sleep", "Span", "Gest")
)
set.seed(1)
out <- vimpute(
sleep,
method = "ranger",
pmm = pmm_na_vars,
sequential = FALSE,
imp_var = FALSE
)

expect_identical(nrow(out), nrow(sleep))
expect_equal(sum(is.na(out)), 0)
#

# vimpute runs multiple sequential iterations when configured", {
set.seed(1)
out <- vimpute(
sleep,
method = "ranger",
sequential = TRUE,
nseq = 2,
eps = -1,
pred_history = TRUE
)

expect_equal(max(out$pred_history$iteration), 2)
expect_equal(length(unique(out$pred_history$iteration)), 2)
#

# vimpute rejects formulas for unsupported methods", {
d <- sleep[1:20, c("Sleep", "Dream", "Span", "BodyWgt")]
expect_error(
vimpute(
d,
method = "ranger",
pmm = FALSE,
formula = list(Sleep ~ Dream + Span + BodyWgt),
sequential = FALSE
)
)
#

# vimpute warns on columns with more than 50% missing values", {
d <- sleep[1:20, c("Sleep", "Dream", "Span")]
d$Dream[1:15] <- NA
expect_warning(
vimpute(d, method = "ranger", sequential = FALSE, imp_var = FALSE),
"more than 50% missing values"
)
#
18 changes: 14 additions & 4 deletions inst/tinytest/test_xgboostImpute.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ x <- rnorm(100)
df <- data.frame(
y = x + rnorm(100, sd = .01),
x = x,
fac = as.factor(x >= 0)
fac = as.factor(x >= 0),
facM = as.factor(abs(round(x)))
)

max_dist <- function(x, y) {
Expand All @@ -13,6 +14,7 @@ max_dist <- function(x, y) {

df$y[1:3] <- NA
df$fac[3:5] <- NA
df$facM[3:5] <- NA
df$binNum <- as.integer(df$fac)+17
df$binInt <- as.integer(df$fac)+17L
# xgboostImpute accuracy", {
Expand All @@ -25,12 +27,17 @@ df$binInt <- as.integer(df$fac)+17L
# xgboostImpute should do nothing for no missings", {
df.out <- xgboostImpute(x ~ y, df)
expect_identical(df.out$x, df$x)
#

# factor response predicted accurately", {
# two-level factor response predicted accurately", {
set.seed(1)
df.out <- xgboostImpute(fac ~ x, df)
expect_identical(df.out$fac, as.factor(df$x >= 0))
#

# three-level factor response predicted accurately", {
set.seed(1)
df.out <- xgboostImpute(facM ~ x, df)
expect_identical(df.out$facM, as.factor(abs(round(df$x))))


# interger binary response predicted accurately", {
expect_warning(df.out <- xgboostImpute(binInt ~ x, df))
Expand All @@ -45,5 +52,8 @@ df$binInt <- as.integer(df$fac)+17L
df2$x[1:10] <- NA
df.out <- xgboostImpute(x ~ fac, df2)
expect_identical(as.factor(df.out$x >= 0), df$fac)
# with verbose enabled.
df.out <- xgboostImpute(x ~ fac, df2, verbose = TRUE)
expect_identical(as.factor(df.out$x >= 0), df$fac)
#

Loading