diff --git a/.gitignore b/.gitignore index 32840e7..612821b 100755 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ test_print_cohorts_direct.R test_rcs_all_methods.R test_inffunc.R /.quarto/ +..Rcheck/00check.log diff --git a/DESCRIPTION b/DESCRIPTION index 39334e6..1f30d05 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: triplediff Title: Triple-Difference Estimators -Version: 0.2.0 +Version: 0.2.1 Authors@R: c(person("Marcelo", "Ortiz-Villavicencio", email = "marcelo.ortiz@emory.edu", role = c("aut", "cre")), person("Pedro H. C.", "Sant'Anna", email = "pedro.santanna@emory.edu", role = c("aut")) ) diff --git a/NEWS.md b/NEWS.md index 921c3fa..e455615 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,3 +14,10 @@ * Replaced `parglm` with `fastglm` to avoid issues related to parglm's scheduled archival on 2026-01-29. * Added support for unbalanced panel data and repeated cross-sectional data by properly implementing the `allow_unbalanced_panel` parameter across all functions. + +# triplediff 0.2.1 + * Add asymmetric propensity score trimming for control units with pscore >= 0.995. + * Add partition-specific collinearity detection with two-stage checking. + * Add comprehensive test suite including Monte Carlo coverage test when trimming. 
+ + diff --git a/R/compute_nuisances.R b/R/compute_nuisances.R index 42c98e0..c74ff3a 100755 --- a/R/compute_nuisances.R +++ b/R/compute_nuisances.R @@ -4,8 +4,50 @@ #' @noRd #-------------------------------------------------- +# Helper function to get descriptive comparison name for a subgroup +# Subgroup encoding: +# 4 = Treated + Eligible (focal group) +# 3 = Treated + Ineligible +# 2 = Eligible + Untreated +# 1 = Untreated + Ineligible +get_comparison_description <- function(condition_subgroup) { + switch(as.character(condition_subgroup), + "3" = "Treated-Eligible vs Treated-Ineligible", + "2" = "Treated-Eligible vs Eligible-Untreated", + "1" = "Treated-Eligible vs Untreated-Ineligible" + ) +} + +# Helper function to check overlap and generate descriptive warning messages +# Returns NULL if overlap is acceptable, otherwise returns warning message +check_overlap_and_get_warning <- function(propensity_scores, condition_subgroup, + threshold = 1e-3, max_proportion = 0.05) { + # Calculate overlap diagnostics + n_total <- length(propensity_scores) + n_below_threshold <- sum(propensity_scores < threshold) + prop_below <- n_below_threshold / n_total + + # Warn if more than max_proportion of units have ps below threshold + if (prop_below <= max_proportion) { + return(NULL) + } + + # Build comparison description + comparison_desc <- get_comparison_description(condition_subgroup) + + # Build informative warning message + msg <- paste0( + "Poor propensity score overlap detected.\n", + " Comparison: ", comparison_desc, " units.\n", + " Diagnostics: ", round(prop_below * 100, 1), "% of units have propensity score < ", threshold, ".\n", + " Consider checking covariate balance or using fewer/different covariates." 
+ ) + + return(msg) +} + # Function to compute propensity scores using fastglm for multiple subgroups -compute_pscore <- function(data, condition_subgroup, xformula) { +compute_pscore <- function(data, condition_subgroup, xformula, trim_level = 0.995) { # Subset data for condition_subgroup and subgroup == 4 or the given condition_subgroup condition_data <- data[data$subgroup %in% c(condition_subgroup, 4)] # Adding treatment variable P(1{PA4 = 4}|X) @@ -32,27 +74,39 @@ compute_pscore <- function(data, condition_subgroup, xformula) { # Flag for convergence of glm if (!model$converged) { - warning(paste("Logistic regression model for subgroup", condition_subgroup, - "did not converge.")) + warning(paste0( + "Propensity score model did not converge.\n", + " Comparison: ", get_comparison_description(condition_subgroup), " units.\n", + " Consider using fewer covariates or checking for separation issues." + )) } # Flag for weird coefficients if(anyNA(model$coefficients)) { - stop(paste("Pscore model coefficients for subgroup", condition_subgroup, - "have NA components. Multicollinearity of covariates is a likely reason")) + stop(paste0( + "Propensity score model has NA coefficients.\n", + " Comparison: ", get_comparison_description(condition_subgroup), " units.\n", + " This is likely due to multicollinearity among covariates." 
+ )) } # Compute propensity scores using fitted values propensity_scores <- fitted(model) - # Warning for overlap condition - if (any(propensity_scores < 5e-4)) { - warning(paste("Propensity scores for comparison subgroup", condition_subgroup, - "have poor overlap.")) + # Warning for overlap condition (proportion-based check) + overlap_warning <- check_overlap_and_get_warning(propensity_scores, condition_subgroup) + if (!is.null(overlap_warning)) { + warning(overlap_warning) } - # Avoid divide by zero - propensity_scores <- pmin(propensity_scores, 1 - 1e-16) + # Avoid divide by zero (following DRDID approach) + propensity_scores <- pmin(propensity_scores, 1 - 1e-6) + + # Trimming indicator (following DRDID approach) + # - Treated units (PA4=1): always included (trim.ps = TRUE) + # - Control units (PA4=0): excluded if pscore >= trim_level + keep_ps <- (propensity_scores < 1.01) # Always TRUE initially + keep_ps[PA4 == 0] <- (propensity_scores[PA4 == 0] < trim_level) # Compute Hessian matrix manually (following DRDID approach) # Hessian = chol2inv(chol(t(X) %*% (W * X))) * n @@ -60,7 +114,7 @@ compute_pscore <- function(data, condition_subgroup, xformula) { W <- propensity_scores * (1 - propensity_scores) * i_weights hessian_matrix <- chol2inv(chol(t(covX) %*% (W * covX))) * n - return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix)) + return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix, keep_ps = keep_ps)) } compute_pscore_null <- function(data, condition_subgroup) { @@ -69,10 +123,13 @@ compute_pscore_null <- function(data, condition_subgroup) { condition_data <- data[data$subgroup %in% c(condition_subgroup, 4)] uid_condition_data <- unique(condition_data, by = "id") - propensity_scores <- rep(1, nrow(uid_condition_data)) + n <- nrow(uid_condition_data) + propensity_scores <- rep(1, n) hessian_matrix <- NA + # No trimming needed for REG method - all units included + keep_ps <- rep(TRUE, n) - 
return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix)) + return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix, keep_ps = keep_ps)) } # Function to compute outcome regression for multiple subgroups @@ -102,8 +159,11 @@ compute_outcome_regression <- function(data, condition_subgroup, xformula){ # Flag for NA coefficients if(anyNA(reg.coeff)) { - stop(paste("Outcome regression model coefficients for subgroup", condition_subgroup, - "have NA components. Multicollinearity of covariates is a likely reason")) + stop(paste0( + "Outcome regression model has NA coefficients.\n", + " Comparison: ", get_comparison_description(condition_subgroup), " units.\n", + " This is likely due to multicollinearity among covariates." + )) } # compute regression adjustment @@ -145,22 +205,25 @@ compute_did <- function(data, condition_subgroup, pscores, reg_adjustment, xform PA4 = ifelse(condition_data$subgroup == 4, 1, 0) PAa = ifelse(condition_data$subgroup == condition_subgroup, 1, 0) - # Compute propensity scores + # Get propensity scores, hessian, trim indicator, and outcome regression if (condition_subgroup == 3) { pscore <- pscores[[1]]$propensity_scores hessian <- pscores[[1]]$hessian_matrix + keep_ps <- pscores[[1]]$keep_ps deltaY <- reg_adjustment[[1]]$deltaY or_delta <- reg_adjustment[[1]]$or_delta } else if (condition_subgroup == 2) { pscore <- pscores[[2]]$propensity_scores hessian <- pscores[[2]]$hessian_matrix + keep_ps <- pscores[[2]]$keep_ps deltaY <- reg_adjustment[[2]]$deltaY or_delta <- reg_adjustment[[2]]$or_delta } else if (condition_subgroup == 1) { pscore <- pscores[[3]]$propensity_scores hessian <- pscores[[3]]$hessian_matrix + keep_ps <- pscores[[3]]$keep_ps deltaY <- reg_adjustment[[3]]$deltaY or_delta <- reg_adjustment[[3]]$or_delta @@ -175,12 +238,15 @@ compute_did <- function(data, condition_subgroup, pscores, reg_adjustment, xform ################################ # Get doubly-robust estimation # 
################################ - w_treat = i_weights * PA4 + # Apply trimming indicator to weights (following DRDID approach) + # Control units with pscore >= trim_level get keep_ps = FALSE, so their weight becomes 0 + w_treat = keep_ps * i_weights * PA4 if (est_method == "reg") { - # Compute doubly-robust estimation - w_control = (i_weights * PAa) + # Compute regression-based estimation (no IPW weights) + w_control = keep_ps * (i_weights * PAa) } else { - w_control = (i_weights * pscore * PAa) / (1 - pscore) + # IPW or DR: apply propensity score weighting + w_control = keep_ps * (i_weights * pscore * PAa) / (1 - pscore) } riesz_treat = w_treat * (deltaY - or_delta) riesz_control = w_control * (deltaY - or_delta) @@ -219,6 +285,11 @@ compute_did <- function(data, condition_subgroup, pscores, reg_adjustment, xform or_ex <- i_weights * PAa * (deltaY - or_delta) * covX XpX <- crossprod(or_x, covX)/nrow(condition_data) + # Check if design matrix is singular (following DRDID approach) + if (base::rcond(XpX) < .Machine$double.eps) { + stop("The regression design matrix is singular. 
Consider removing some covariates.") + } + #asymptotic linear representation of the beta asy_linear_or <- t(solve(XpX, t(or_ex))) @@ -250,7 +321,7 @@ compute_did <- function(data, condition_subgroup, pscores, reg_adjustment, xform # Repeated Cross-Section Functions #-------------------------------------------------- -compute_pscore_rc <- function(data, condition_subgroup, xformula) { +compute_pscore_rc <- function(data, condition_subgroup, xformula, trim_level = 0.995) { # Similar to compute_pscore but no unique(by="id") - uses all RCS data condition_data <- data[data$subgroup %in% c(condition_subgroup, 4)] condition_data[, "PA4" := ifelse(condition_data$subgroup == 4, 1, 0)] @@ -273,14 +344,41 @@ compute_pscore_rc <- function(data, condition_subgroup, xformula) { ) ) - if (!model$converged) warning(paste("RC pscore model for subgroup", condition_subgroup, "did not converge.")) - if(anyNA(model$coefficients)) stop(paste("RC pscore model coefficients for subgroup", condition_subgroup, "have NA components. Multicollinearity of covariates is a likely reason. ")) + # Flag for convergence of glm + if (!model$converged) { + warning(paste0( + "Propensity score model did not converge.\n", + " Comparison: ", get_comparison_description(condition_subgroup), " units.\n", + " Consider using fewer covariates or checking for separation issues." + )) + } + + # Flag for weird coefficients + if(anyNA(model$coefficients)) { + stop(paste0( + "Propensity score model has NA coefficients.\n", + " Comparison: ", get_comparison_description(condition_subgroup), " units.\n", + " This is likely due to multicollinearity among covariates." 
+ )) + } # Compute propensity scores using fitted values propensity_scores <- fitted(model) - if (any(propensity_scores < 5e-8)) warning(paste("Propensity scores for comparison subgroup", condition_subgroup, "have poor overlap.")) - propensity_scores <- pmin(propensity_scores, 1 - 1e-16) + # Warning for overlap condition (proportion-based check) + overlap_warning <- check_overlap_and_get_warning(propensity_scores, condition_subgroup) + if (!is.null(overlap_warning)) { + warning(overlap_warning) + } + + # Avoid divide by zero (following DRDID approach) + propensity_scores <- pmin(propensity_scores, 1 - 1e-6) + + # Trimming indicator (following DRDID approach) + # - Treated units (PA4=1): always included (trim.ps = TRUE) + # - Control units (PA4=0): excluded if pscore >= trim_level + keep_ps <- (propensity_scores < 1.01) # Always TRUE initially + keep_ps[PA4 == 0] <- (propensity_scores[PA4 == 0] < trim_level) # Compute Hessian matrix manually (following DRDID approach) # Hessian = chol2inv(chol(t(X) %*% (W * X))) * n @@ -288,14 +386,17 @@ compute_pscore_rc <- function(data, condition_subgroup, xformula) { W <- propensity_scores * (1 - propensity_scores) * i_weights hessian_matrix <- chol2inv(chol(t(covX) %*% (W * covX))) * n - return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix)) + return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix, keep_ps = keep_ps)) } compute_pscore_null_rc <- function(data, condition_subgroup) { condition_data <- data[data$subgroup %in% c(condition_subgroup, 4)] - propensity_scores <- rep(1, nrow(condition_data)) + n <- nrow(condition_data) + propensity_scores <- rep(1, n) hessian_matrix <- NA - return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix)) + # No trimming needed for REG method - all units included + keep_ps <- rep(TRUE, n) + return(list(propensity_scores = propensity_scores, hessian_matrix = hessian_matrix, keep_ps = keep_ps)) } 
compute_outcome_regression_rc <- function(data, condition_subgroup, xformula){ @@ -321,7 +422,15 @@ compute_outcome_regression_rc <- function(data, condition_subgroup, xformula){ ) ) - if(anyNA(reg.coeff)) stop(paste("Outcome regression coefficients NA for subgroup", subg, "post", pst)) + if(anyNA(reg.coeff)) { + subg_desc <- if(subg == 4) "Treated-Eligible" else get_comparison_description(condition_subgroup) + period_desc <- if(pst == 1) "post-treatment" else "pre-treatment" + stop(paste0( + "Outcome regression model has NA coefficients.\n", + " Cell: ", subg_desc, " units in ", period_desc, " period.\n", + " This is likely due to multicollinearity among covariates." + )) + } # Predict for ALL cov_all <- stats::model.matrix(as.formula(xformula), data = condition_data) @@ -350,6 +459,7 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf pscore <- pscores[[idx]]$propensity_scores hessian <- pscores[[idx]]$hessian_matrix + keep_ps <- pscores[[idx]]$keep_ps mu <- reg_adjustment[[idx]] # Basic variables @@ -371,11 +481,11 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf #----------------------------------------- # IPW Estimator #----------------------------------------- - # Riesz representers - riesz_treat_pre <- i_weights * PA4 * (1 - post) - riesz_treat_post <- i_weights * PA4 * post - riesz_control_pre <- i_weights * pscore * PAa * (1 - post) / (1 - pscore) - riesz_control_post <- i_weights * pscore * PAa * post / (1 - pscore) + # Riesz representers (with trimming applied following DRDID approach) + riesz_treat_pre <- keep_ps * i_weights * PA4 * (1 - post) + riesz_treat_post <- keep_ps * i_weights * PA4 * post + riesz_control_pre <- keep_ps * i_weights * pscore * PAa * (1 - post) / (1 - pscore) + riesz_control_post <- keep_ps * i_weights * pscore * PAa * post / (1 - pscore) # Elements of the influence function (summands) eta_treat_pre <- riesz_treat_pre * y / mean(riesz_treat_pre) @@ -423,10 +533,10 
@@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf or_ctrl_pre <- mu$or_ctrl_pre or_ctrl_post <- mu$or_ctrl_post - # Riesz representers - riesz_treat_pre <- i_weights * PA4 * (1 - post) - riesz_treat_post <- i_weights * PA4 * post - riesz_control <- i_weights * PA4 + # Riesz representers (with trimming applied following DRDID approach) + riesz_treat_pre <- keep_ps * i_weights * PA4 * (1 - post) + riesz_treat_post <- keep_ps * i_weights * PA4 * post + riesz_control <- keep_ps * i_weights * PA4 reg_att_treat_pre <- riesz_treat_pre * y reg_att_treat_post <- riesz_treat_post * y @@ -445,6 +555,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_pre <- weights_ols_pre * covX wols_eX_pre <- weights_ols_pre * (y - or_ctrl_pre) * covX XpX_pre <- base::crossprod(wols_x_pre, covX) / n + if (base::rcond(XpX_pre) < .Machine$double.eps) { + stop("The regression design matrix for pre-treatment is singular. Consider removing some covariates.") + } XpX_inv_pre <- solve(XpX_pre) asy_lin_rep_ols_pre <- wols_eX_pre %*% XpX_inv_pre @@ -453,6 +566,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_post <- weights_ols_post * covX wols_eX_post <- weights_ols_post * (y - or_ctrl_post) * covX XpX_post <- base::crossprod(wols_x_post, covX) / n + if (base::rcond(XpX_post) < .Machine$double.eps) { + stop("The regression design matrix for post-treatment is singular. 
Consider removing some covariates.") + } XpX_inv_post <- solve(XpX_post) asy_lin_rep_ols_post <- wols_eX_post %*% XpX_inv_post @@ -481,15 +597,15 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf or_trt_pre <- mu$or_trt_pre or_trt_post <- mu$or_trt_post - # Riesz representers - riesz_treat_pre <- i_weights * PA4 * (1 - post) - riesz_treat_post <- i_weights * PA4 * post - riesz_control_pre <- i_weights * pscore * PAa * (1 - post) / (1 - pscore) - riesz_control_post <- i_weights * pscore * PAa * post / (1 - pscore) + # Riesz representers (with trimming applied following DRDID approach) + riesz_treat_pre <- keep_ps * i_weights * PA4 * (1 - post) + riesz_treat_post <- keep_ps * i_weights * PA4 * post + riesz_control_pre <- keep_ps * i_weights * pscore * PAa * (1 - post) / (1 - pscore) + riesz_control_post <- keep_ps * i_weights * pscore * PAa * post / (1 - pscore) - riesz_d <- i_weights * PA4 - riesz_dt1 <- i_weights * PA4 * post - riesz_dt0 <- i_weights * PA4 * (1 - post) + riesz_d <- keep_ps * i_weights * PA4 + riesz_dt1 <- keep_ps * i_weights * PA4 * post + riesz_dt0 <- keep_ps * i_weights * PA4 * (1 - post) # Elements of the influence function (summands) eta_treat_pre <- riesz_treat_pre * (y - or_ctrl) / mean(riesz_treat_pre) @@ -525,6 +641,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_pre <- weights_ols_pre * covX wols_eX_pre <- weights_ols_pre * (y - or_ctrl_pre) * covX XpX_pre <- base::crossprod(wols_x_pre, covX) / n + if (base::rcond(XpX_pre) < .Machine$double.eps) { + stop("The regression design matrix for control pre-treatment is singular. 
Consider removing some covariates.") + } XpX_inv_pre <- solve(XpX_pre) asy_lin_rep_ols_pre <- wols_eX_pre %*% XpX_inv_pre @@ -533,6 +652,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_post <- weights_ols_post * covX wols_eX_post <- weights_ols_post * (y - or_ctrl_post) * covX XpX_post <- base::crossprod(wols_x_post, covX) / n + if (base::rcond(XpX_post) < .Machine$double.eps) { + stop("The regression design matrix for control post-treatment is singular. Consider removing some covariates.") + } XpX_inv_post <- solve(XpX_post) asy_lin_rep_ols_post <- wols_eX_post %*% XpX_inv_post @@ -541,6 +663,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_pre_treat <- weights_ols_pre_treat * covX wols_eX_pre_treat <- weights_ols_pre_treat * (y - or_trt_pre) * covX XpX_pre_treat <- base::crossprod(wols_x_pre_treat, covX) / n + if (base::rcond(XpX_pre_treat) < .Machine$double.eps) { + stop("The regression design matrix for treated pre-treatment is singular. Consider removing some covariates.") + } XpX_inv_pre_treat <- solve(XpX_pre_treat) asy_lin_rep_ols_pre_treat <- wols_eX_pre_treat %*% XpX_inv_pre_treat @@ -549,6 +674,9 @@ compute_did_rc <- function(data, condition_subgroup, pscores, reg_adjustment, xf wols_x_post_treat <- weights_ols_post_treat * covX wols_eX_post_treat <- weights_ols_post_treat * (y - or_trt_post) * covX XpX_post_treat <- base::crossprod(wols_x_post_treat, covX) / n + if (base::rcond(XpX_post_treat) < .Machine$double.eps) { + stop("The regression design matrix for treated post-treatment is singular. 
Consider removing some covariates.") + } XpX_inv_post_treat <- solve(XpX_post_treat) asy_lin_rep_ols_post_treat <- wols_eX_post_treat %*% XpX_inv_post_treat diff --git a/R/preprocess.R b/R/preprocess.R index 01505f4..6440c57 100755 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -3,6 +3,60 @@ #' @import BMisc #' @export NULL +#-------------------------------------------------- +# Helper function to check partition-specific collinearity +# Returns a list with: +# - collinear_vars: named list mapping dropped variables to the partition(s) where they're collinear +# - all_collinear: vector of all variables that should be dropped globally +check_partition_collinearity <- function(data, subgroup_col, cov_cols, tol = 1e-6) { + # Get unique subgroups (should be 1, 2, 3, 4) + subgroups <- sort(unique(data[[subgroup_col]])) + + # We check comparisons: 4 vs 3, 4 vs 2, 4 vs 1 + # These are the partitions used in the DDD estimation + comparison_groups <- subgroups[subgroups < 4] + + # Track which variables are collinear in which partition + partition_collinear <- list() + + for (comp_group in comparison_groups) { + # Subset data to subgroup 4 and the comparison group + subset_data <- data[data[[subgroup_col]] %in% c(4, comp_group), ] + + # Get covariate matrix for this subset + cov_subset <- as.matrix(subset_data[, cov_cols, with = FALSE]) + + # Skip if no observations (shouldn't happen, but be safe) + if (nrow(cov_subset) == 0) next + + # Use QR decomposition to detect collinearity + qr_subset <- qr(cov_subset, tol = tol) + rank_subset <- qr_subset$rank + + # Get indices of linearly independent columns + non_collinear_indices <- qr_subset$pivot[seq_len(rank_subset)] + + # Find collinear variables in this subset + collinear_in_subset <- setdiff(cov_cols, cov_cols[non_collinear_indices]) + + if (length(collinear_in_subset) > 0) { + partition_name <- paste0("subgroup 4 vs ", comp_group) + for (var in collinear_in_subset) { + if (is.null(partition_collinear[[var]])) { + 
partition_collinear[[var]] <- partition_name + } else { + partition_collinear[[var]] <- c(partition_collinear[[var]], partition_name) + } + } + } + } + + return(list( + collinear_vars = partition_collinear, + all_collinear = names(partition_collinear) + )) +} + #-------------------------------------------------- # Function to pre-process the data to use on ddd estimator @@ -253,6 +307,11 @@ run_nopreprocess_2periods <- function(yname, rank_m <- qr_m$rank # Get the indices of the non-collinear columns non_collinear_indices <- qr_m$pivot[seq_len(rank_m)] + # Check if any covariates were dropped due to global collinearity (following DRDID approach) + dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices]) + if (length(dropped_covariates_global) > 0) { + warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", ")) + } # Drop the collinear columns from the data.table cleaned_data <- cleaned_data[, c(seq(1,idx_static_vars,1), non_collinear_indices + idx_static_vars), with = FALSE] @@ -260,6 +319,36 @@ run_nopreprocess_2periods <- function(yname, #cleaned_data[, (idx_static_vars+1) := NULL] cleaned_data[, "(Intercept)" := NULL] + # Check for partition-specific collinearity (after global check) + # Get remaining covariate column names (excluding static vars) + cov_cols_remaining <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup")) + if (length(cov_cols_remaining) > 0) { + partition_check <- check_partition_collinearity(cleaned_data, "subgroup", cov_cols_remaining) + + if (length(partition_check$all_collinear) > 0) { + # Build informative warning message + partition_warnings <- sapply(names(partition_check$collinear_vars), function(var) { + partitions <- paste(partition_check$collinear_vars[[var]], collapse = ", ") + paste0(" - ", var, " (collinear in: ", partitions, ")") + }) + warning("The following 
covariates were dropped due to partition-specific collinearity:\n", + paste(partition_warnings, collapse = "\n")) + + # Drop these covariates globally + cols_to_keep <- setdiff(names(cleaned_data), partition_check$all_collinear) + cleaned_data <- cleaned_data[, ..cols_to_keep] + } + } + + # Update xformula to reflect dropped covariates (following DRDID approach) + # Get final covariate column names after all collinearity checks + final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup")) + if (length(final_cov_cols) > 0) { + xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + "))) + } else { + xformla <- ~1 + } + out <- list(preprocessed_data = cleaned_data, xformula = xformla, est_method = est_method, @@ -603,12 +692,47 @@ run_preprocess_2Periods <- function(yname, rank_m <- qr_m$rank # Get the indices of the non-collinear columns non_collinear_indices <- qr_m$pivot[seq_len(rank_m)] + # Check if any covariates were dropped due to global collinearity (following DRDID approach) + dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices]) + if (length(dropped_covariates_global) > 0) { + warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", ")) + } # Drop the collinear columns from the data.table cleaned_data <- cleaned_data[, c(seq(1,idx_static_vars,1), non_collinear_indices + idx_static_vars), with = FALSE] # drop the intercept cleaned_data[, "(Intercept)" := NULL] + # Check for partition-specific collinearity (after global check) + # Get remaining covariate column names (excluding static vars) + cov_cols_remaining <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup")) + if (length(cov_cols_remaining) > 0) { + partition_check <- check_partition_collinearity(cleaned_data, "subgroup", 
cov_cols_remaining) + + if (length(partition_check$all_collinear) > 0) { + # Build informative warning message + partition_warnings <- sapply(names(partition_check$collinear_vars), function(var) { + partitions <- paste(partition_check$collinear_vars[[var]], collapse = ", ") + paste0(" - ", var, " (collinear in: ", partitions, ")") + }) + warning("The following covariates were dropped due to partition-specific collinearity:\n", + paste(partition_warnings, collapse = "\n")) + + # Drop these covariates globally + cols_to_keep <- setdiff(names(cleaned_data), partition_check$all_collinear) + cleaned_data <- cleaned_data[, ..cols_to_keep] + } + } + + # Update xformula to reflect dropped covariates (following DRDID approach) + # Get final covariate column names after all collinearity checks + final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup")) + if (length(final_cov_cols) > 0) { + xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + "))) + } else { + xformla <- ~1 + } + out <- list(preprocessed_data = cleaned_data, xformula = xformla, tname = tname, @@ -1037,9 +1161,45 @@ run_preprocess_multPeriods <- function(yname, data = dta, na.action = na.pass)) } + + # Remove collinear variables (following DRDID approach) + # Determine the number of static columns (before covariates) + # Static cols: id, y, first_treat, period, partition, weights, [cluster] + idx_static_vars <- ifelse(is.null(cluster), 6, 7) + + # Convert the covariate columns (including intercept) to a matrix + cov_m <- as.matrix(cleaned_data[, -c(1:idx_static_vars), with = FALSE]) + + # Only check for collinearity if there are covariates beyond the intercept + if (ncol(cov_m) > 1) { + # Use the qr() function to detect collinear columns + qr_m <- qr(cov_m, tol = 1e-6) + # Get the rank of the matrix + rank_m <- qr_m$rank + # Get the indices of the non-collinear columns + non_collinear_indices <- 
qr_m$pivot[seq_len(rank_m)] + # Check if any covariates were dropped due to global collinearity + dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices]) + if (length(dropped_covariates_global) > 0) { + warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", ")) + } + # Drop the collinear columns from the data.table + cleaned_data <- cleaned_data[, c(seq(1, idx_static_vars, 1), non_collinear_indices + idx_static_vars), with = FALSE] + } + # drop the intercept cleaned_data[, "(Intercept)" := NULL] + # Update xformula to reflect dropped covariates (following DRDID approach) + # Get final covariate column names after collinearity check + # Static cols in multi-period: id, y, first_treat, period, partition, weights, [cluster] + final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "first_treat", "period", "partition", "weights", "cluster")) + if (length(final_cov_cols) > 0) { + xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + "))) + } else { + xformla <- ~1 + } + # order dataset wrt idname and tname setorder(cleaned_data, "id", "period") diff --git a/README.md b/README.md index 5727486..d450881 100755 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # Triple Differences Estimators ![](https://img.shields.io/badge/release%20lifecycle-alpha-orange.svg) -[![](https://img.shields.io/badge/devel%20version-0.2.0-blue.svg)](https://github.com/marcelortizv/triplediff) +[![](https://img.shields.io/badge/devel%20version-0.2.1-blue.svg)](https://github.com/marcelortizv/triplediff) [![](https://img.shields.io/badge/doi-10.48550/arXiv.2505.09942-yellow.svg)](https://doi.org/10.48550/arXiv.2505.09942) diff --git a/man/figures/README-unnamed-chunk-10-1.png b/man/figures/README-unnamed-chunk-10-1.png index 96588fb..e468c6d 100644 Binary files a/man/figures/README-unnamed-chunk-10-1.png and b/man/figures/README-unnamed-chunk-10-1.png differ diff 
--git a/tests/testthat/test-preprocess.R b/tests/testthat/test-preprocess.R index 358d4a9..cfbc99b 100755 --- a/tests/testthat/test-preprocess.R +++ b/tests/testthat/test-preprocess.R @@ -89,11 +89,11 @@ test_that("Testing error handling in run_preprocess_2periods() function", { inffunc = FALSE, skip_data_checks = FALSE)) # Test 5: error for missing values in treatment variable "gname" - expect_error(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", + expect_error(suppressWarnings(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", pname = "partition", xformla = ~x1 + x2, data = missing_values_treat_df, control_group = "nevertreated", base_period = NULL, est_method = "dr", weightsname = NULL, boot = FALSE, nboot = NULL, - inffunc = FALSE, skip_data_checks = FALSE)) + inffunc = FALSE, skip_data_checks = FALSE))) # Test 6: error for small groups for inference (e.g. only one treated unit) expect_error(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", @@ -166,11 +166,11 @@ test_that("Testing error handling in run_preprocess_2periods() function", { inffunc = FALSE, skip_data_checks = FALSE)) # Test 16: More that 2 time periods - expect_error(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", + expect_error(suppressWarnings(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", pname = "partition", xformla = ~x1 + x2, data = more_than_two_periods_df, control_group = "nevertreated", base_period = NULL, est_method = "dr", weightsname = NULL, boot = FALSE, nboot = NULL, - inffunc = FALSE, skip_data_checks = FALSE)) + inffunc = FALSE, skip_data_checks = FALSE))) # Test 17: Partition is not numeric expect_error(ddd(yname = "outcome", tname = "year", idname = "id", gname = "treat", @@ -194,3 +194,1290 @@ test_that("Testing error handling in run_preprocess_2periods() function", { inffunc = FALSE, skip_data_checks = FALSE)) }) + +# Tests for collinearity handling + 
+test_that("check_partition_collinearity helper function detects global collinearity", { + # Test the helper function directly with globally collinear variables + library(data.table) + + # Create test data with known collinearity (x3 = x1 exactly) + set.seed(789) + n <- 100 + test_data <- data.table( + subgroup = c(rep(4, 25), rep(3, 25), rep(2, 25), rep(1, 25)), + x1 = rnorm(n), + x2 = rnorm(n) + ) + + # Add a variable that's collinear with x1 (globally collinear) + test_data$x3 <- test_data$x1 + + # Test the function + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + c("x1", "x2", "x3") + ) + + # Should detect collinearity in all partitions (since x3 = x1 everywhere) + expect_true(length(result$all_collinear) > 0) + expect_true("x3" %in% result$all_collinear) +}) + +test_that("check_partition_collinearity detects subset-specific collinearity", { + # Test the helper function with subset-specific collinearity + # x3 is a linear combination of x1 only within subgroups 4 and 3 + library(data.table) + + set.seed(999) + n <- 200 + + # Create test data + test_data <- data.table( + subgroup = c(rep(4, 50), rep(3, 50), rep(2, 50), rep(1, 50)), + x1 = rnorm(n), + x2 = rnorm(n) + ) + + # Add x3 that's collinear with x1 only in subgroups 4 and 3 + test_data$x3 <- rnorm(n) + # Make x3 = 2*x1 only in subgroups 4 and 3 (perfect collinearity) + test_data[subgroup %in% c(4, 3), x3 := 2 * x1] + + # Test the function + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + c("x1", "x2", "x3") + ) + + # Should detect collinearity in "subgroup 4 vs 3" + expect_true("x3" %in% result$all_collinear) + expect_true(any(grepl("subgroup 4 vs 3", result$collinear_vars$x3))) +}) + +test_that("check_partition_collinearity returns empty when no collinearity", { + # Test the helper function with independent variables + library(data.table) + + set.seed(111) + n <- 100 + test_data <- data.table( + subgroup = c(rep(4, 25), rep(3, 25), 
rep(2, 25), rep(1, 25)), + x1 = rnorm(n), + x2 = rnorm(n), + x3 = rnorm(n) + ) + + # Test the function - should find no collinearity + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + c("x1", "x2", "x3") + ) + + # Should NOT detect any collinearity + expect_equal(length(result$all_collinear), 0) +}) + +test_that("check_partition_collinearity identifies correct partition", { + # Test that the function correctly identifies which partition has collinearity + library(data.table) + + set.seed(222) + n <- 200 + + # Create test data + test_data <- data.table( + subgroup = c(rep(4, 50), rep(3, 50), rep(2, 50), rep(1, 50)), + x1 = rnorm(n), + x2 = rnorm(n) + ) + + # Add x3 that's collinear with x1 ONLY in subgroups 4 and 2 (not 3 or 1) + test_data$x3 <- rnorm(n) + test_data[subgroup %in% c(4, 2), x3 := 3 * x1] + + # Test the function + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + c("x1", "x2", "x3") + ) + + # Should detect collinearity in "subgroup 4 vs 2" + expect_true("x3" %in% result$all_collinear) + expect_true(any(grepl("subgroup 4 vs 2", result$collinear_vars$x3))) +}) + +test_that("check_partition_collinearity detects constant variables (collinear with intercept)", { + # IMPORTANT: A constant variable is collinear with the intercept + # This is a key case that must be detected + library(data.table) + + set.seed(333) + n <- 200 + + # Create test data with an intercept column + test_data <- data.table( + subgroup = c(rep(4, 50), rep(3, 50), rep(2, 50), rep(1, 50)), + intercept = rep(1, n), # Intercept column (all 1s) + x1 = rnorm(n), + x2 = rnorm(n) + ) + + # Add x3 that's constant only in subgroups 4 and 3 (collinear with intercept in that partition) + test_data$x3 <- rnorm(n) + test_data[subgroup %in% c(4, 3), x3 := 5] # Constant value in 4 vs 3 comparison + + # Test the function with intercept included + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + 
c("intercept", "x1", "x2", "x3") + ) + + # Should detect that x3 is collinear with the intercept in "subgroup 4 vs 3" + expect_true("x3" %in% result$all_collinear) + expect_true(any(grepl("subgroup 4 vs 3", result$collinear_vars$x3))) +}) + +test_that("check_partition_collinearity detects globally constant variables", { + # A globally constant variable should be detected in ALL partitions + library(data.table) + + set.seed(444) + n <- 200 + + # Create test data with an intercept column + test_data <- data.table( + subgroup = c(rep(4, 50), rep(3, 50), rep(2, 50), rep(1, 50)), + intercept = rep(1, n), # Intercept column (all 1s) + x1 = rnorm(n), + x2 = rnorm(n), + x3 = rep(5, n) # Globally constant - should be collinear with intercept everywhere + ) + + # Test the function + result <- triplediff:::check_partition_collinearity( + test_data, + "subgroup", + c("intercept", "x1", "x2", "x3") + ) + + # Should detect that x3 is collinear with the intercept in ALL partitions + expect_true("x3" %in% result$all_collinear) + # Should be detected in all three comparisons (4 vs 3, 4 vs 2, 4 vs 1) + expect_true(length(result$collinear_vars$x3) == 3) +}) + +# ============================================================================= +# Propensity Score Trimming Tests +# ============================================================================= + +test_that("compute_pscore returns keep_ps indicator for trimming", { + # Test that compute_pscore returns keep_ps and that it follows DRDID approach + library(data.table) + + # Generate test data with panel structure + set.seed(555) + n <- 200 + + # Create panel data with 2 time periods + test_data <- data.table( + id = rep(1:n, each = 2), + post = rep(c(0, 1), n), + subgroup = rep(c(rep(4, n/2), rep(3, n/2)), each = 2), + y = rnorm(n * 2), + weights = rep(1, n * 2), + x1 = rep(rnorm(n), each = 2), + x2 = rep(rnorm(n), each = 2) + ) + + # Call compute_pscore + result <- triplediff:::compute_pscore( + data = test_data, + 
condition_subgroup = 3, + xformula = ~ x1 + x2, + trim_level = 0.995 + ) + + # Should return propensity_scores, hessian_matrix, and keep_ps + + expect_true("propensity_scores" %in% names(result)) + expect_true("hessian_matrix" %in% names(result)) + expect_true("keep_ps" %in% names(result)) + + # keep_ps should be a logical vector of same length as propensity_scores + expect_equal(length(result$keep_ps), length(result$propensity_scores)) + expect_true(is.logical(result$keep_ps)) +}) + +test_that("compute_pscore_null returns keep_ps = TRUE for all units (REG method)", { + # REG method doesn't use IPW, so no trimming is needed + library(data.table) + + set.seed(666) + n <- 100 + + test_data <- data.table( + id = rep(1:n, each = 2), + post = rep(c(0, 1), n), + subgroup = rep(c(rep(4, n/2), rep(3, n/2)), each = 2), + y = rnorm(n * 2), + weights = rep(1, n * 2) + ) + + result <- triplediff:::compute_pscore_null( + data = test_data, + condition_subgroup = 3 + ) + + # Should return keep_ps = TRUE for all units + expect_true("keep_ps" %in% names(result)) + expect_true(all(result$keep_ps == TRUE)) +}) + +test_that("propensity score trimming excludes controls with ps >= 0.995", { + # Test that control units with high propensity scores are trimmed + library(data.table) + + # Create data where some controls will have very high propensity scores + # by making their covariates similar to treated units + set.seed(777) + n_treated <- 100 + n_control <- 100 + + # Treated units (subgroup 4) have x1 ~ N(3, 0.5) - strongly positive + treated_x1 <- rnorm(n_treated, mean = 3, sd = 0.5) + + # Controls: most have x1 ~ N(-1, 1), but some have x1 ~ N(3, 0.1) (almost like treated) + n_high_ps_controls <- 10 + control_x1 <- c( + rnorm(n_control - n_high_ps_controls, mean = -1, sd = 1), # Normal controls + rnorm(n_high_ps_controls, mean = 3, sd = 0.1) # "Almost treated" controls + ) + + test_data <- data.table( + id = rep(1:(n_treated + n_control), each = 2), + post = rep(c(0, 1), n_treated + 
test_that("propensity scores are capped at 1 - 1e-6 (following DRDID)", {
  # Even under perfect separation (treated x1 in [5, 10], controls in
  # [-10, -5]) fitted propensity scores must stay strictly inside (0, 1),
  # bounded above by 1 - 1e-6 as in the DRDID package.
  library(data.table)

  set.seed(888)
  n_treated <- 50
  n_control <- 50

  # Perfect separation between the two groups on x1.
  treated_x1 <- runif(n_treated, min = 5, max = 10)
  control_x1 <- runif(n_control, min = -10, max = -5)

  test_data <- data.table(
    id = rep(1:(n_treated + n_control), each = 2),
    post = rep(c(0, 1), n_treated + n_control),
    subgroup = rep(c(rep(4, n_treated), rep(3, n_control)), each = 2),
    y = rnorm((n_treated + n_control) * 2),
    weights = rep(1, (n_treated + n_control) * 2),
    x1 = rep(c(treated_x1, control_x1), each = 2)
  )

  result <- suppressWarnings(
    triplediff:::compute_pscore(
      data = test_data,
      condition_subgroup = 3,
      xformula = ~ x1,
      trim_level = 0.995
    )
  )

  # Upper bound 1 - 1e-6 and a strictly positive lower bound.
  expect_true(all(result$propensity_scores <= 1 - 1e-6))
  expect_true(all(result$propensity_scores > 0))
})

test_that("trimming is applied to ATT weights in compute_did", {
  # Integration test: the pscore object consumed downstream by compute_did()
  # must carry the keep_ps indicator so trimmed controls receive zero weight.
  # Fix vs. earlier draft: removed dead statements with no effect —
  # `uid_data[, subgroup := subgroup]` was a no-op self-assignment, and the
  # `reg_adjustment` / `uid_controls` locals were assigned but never used.
  library(data.table)

  set.seed(999)
  n_treated <- 80
  n_control <- 80

  # A handful of "almost treated" controls whose covariates match the
  # treated distribution; their pscores should exceed the 0.995 trim level.
  treated_x1 <- rnorm(n_treated, mean = 2, sd = 0.5)
  n_high_ps <- 5
  control_x1 <- c(
    rnorm(n_control - n_high_ps, mean = -2, sd = 1),
    rnorm(n_high_ps, mean = 2, sd = 0.1)
  )

  test_data <- data.table(
    id = rep(1:(n_treated + n_control), each = 2),
    post = rep(c(0, 1), n_treated + n_control),
    subgroup = rep(c(rep(4, n_treated), rep(3, n_control)), each = 2),
    y = rnorm((n_treated + n_control) * 2),
    weights = rep(1, (n_treated + n_control) * 2),
    x1 = rep(c(treated_x1, control_x1), each = 2)
  )

  pscores_result <- suppressWarnings(triplediff:::compute_pscore(
    data = test_data,
    condition_subgroup = 3,
    xformula = ~ x1,
    trim_level = 0.995
  ))

  # compute_did() receives pscores as a list indexed by comparison subgroup
  # (index 1 corresponds to subgroup 3 here).
  pscores <- list(pscores_result)

  # The outcome-regression component must also be computable on this data.
  reg_result <- triplediff:::compute_outcome_regression(
    data = test_data,
    condition_subgroup = 3,
    xformula = ~ x1
  )
  expect_true(!is.null(reg_result))

  # The trimming indicator must travel with the pscore list.
  expect_true("keep_ps" %in% names(pscores[[1]]))

  # Any control with ps >= 0.995 must be flagged as trimmed.
  uid_data <- unique(test_data, by = "id")
  control_ps <- pscores_result$propensity_scores[uid_data$subgroup == 3]
  control_keep <- pscores_result$keep_ps[uid_data$subgroup == 3]
  high_ps_controls <- control_ps >= 0.995
  if (any(high_ps_controls)) {
    expect_true(all(control_keep[high_ps_controls] == FALSE))
  }
})
test_that("confidence interval coverage is valid after trimming", {
  # Monte Carlo check that 95% CIs cover the true ATT within +/- 2 points of
  # nominal when asymmetric pscore trimming is active. Each replication plants
  # a few "almost treated" controls (x1 ~ N(10, 0.1)) whose pscores exceed
  # 0.995, exercising the trimming path.
  skip_on_cran()  # expensive: 1000 replications of n = 2000

  library(data.table)

  set.seed(1234)
  n_sims <- 1000
  true_att <- 2
  alpha <- 0.05
  nominal_coverage <- 1 - alpha  # 0.95

  covers <- logical(n_sims)

  for (sim in seq_len(n_sims)) {
    # Balanced 2x2 design: treated/control x partition 0/1.
    n_per_cell <- 500
    n_units <- 4 * n_per_cell
    treat_group <- c(rep(1, 2 * n_per_cell), rep(0, 2 * n_per_cell))
    partition <- c(rep(1, n_per_cell), rep(0, n_per_cell),
                   rep(1, n_per_cell), rep(0, n_per_cell))

    # Treated: x1 ~ N(0.5, 1). Controls: mostly N(-0.5, 1) plus 5
    # near-treated draws per partition (N(10, 0.1)) to trigger trimming.
    x1_treated <- rnorm(2 * n_per_cell, mean = 0.5, sd = 1)
    n_high_ps <- 5
    n_normal_control <- n_per_cell - n_high_ps
    x1_control <- c(
      rnorm(n_normal_control, mean = -0.5, sd = 1),  # normal controls, partition = 1
      rnorm(n_high_ps, mean = 10, sd = 0.1),         # near-treated controls, partition = 1
      rnorm(n_normal_control, mean = -0.5, sd = 1),  # normal controls, partition = 0
      rnorm(n_high_ps, mean = 10, sd = 0.1)          # near-treated controls, partition = 0
    )
    x1 <- c(x1_treated, x1_control)
    x2 <- rnorm(n_units)

    # Two-period panel.
    panel_data <- data.table(
      id = rep(1:n_units, each = 2),
      year = rep(c(0, 1), n_units),
      partition = rep(partition, each = 2),
      treat = rep(treat_group, each = 2),
      x1 = rep(x1, each = 2),
      x2 = rep(x2, each = 2)
    )

    # DDD outcome: the triple interaction carries the true ATT.
    panel_data[, outcome := (
      1 +
        0.5 * partition +
        0.3 * year +
        0.4 * treat +
        0.5 * x1 + 0.3 * x2 +
        0.2 * partition * year +
        0.2 * treat * year +
        0.1 * partition * treat +
        true_att * treat * year * partition +
        rnorm(.N, sd = 1)
    )]

    fit <- tryCatch({
      suppressWarnings(
        ddd(
          yname = "outcome",
          tname = "year",
          idname = "id",
          gname = "treat",
          pname = "partition",
          xformla = ~ x1 + x2,
          data = panel_data,
          est_method = "dr",
          boot = FALSE,
          inffunc = TRUE
        )
      )
    }, error = function(e) NULL)

    if (!is.null(fit) && !is.null(fit$ATT) && !is.null(fit$se) &&
        !is.na(fit$ATT) && !is.na(fit$se)) {
      # ddd() returns ATT and se directly (not nested).
      est <- fit$ATT
      se <- fit$se
      ci_lower <- est - qnorm(1 - alpha / 2) * se
      ci_upper <- est + qnorm(1 - alpha / 2) * se
      covers[sim] <- (true_att >= ci_lower) & (true_att <= ci_upper)
    } else {
      covers[sim] <- NA  # failed replication, excluded from the rate
    }
  }

  # Empirical coverage over successful replications.
  valid_covers <- covers[!is.na(covers)]
  coverage_rate <- mean(valid_covers)

  # Must land within 2 points of the 95% nominal level.
  expect_true(coverage_rate >= nominal_coverage - 0.02,
              info = paste("Coverage rate:", round(coverage_rate, 4),
                           "- expected at least", nominal_coverage - 0.02))
  expect_true(coverage_rate <= nominal_coverage + 0.02,
              info = paste("Coverage rate:", round(coverage_rate, 4),
                           "- expected at most", nominal_coverage + 0.02))
})
# =============================================================================
# Global Collinearity Check Tests for Multiple Periods
# =============================================================================

test_that("run_preprocess_multPeriods detects and drops globally collinear covariates", {
  # Staggered-adoption panel with x3 = 2 * x1: preprocessing must warn about
  # collinearity and still return a valid estimate.
  library(data.table)

  set.seed(12345)
  n_units <- 200
  n_periods <- 4

  # Groups: 0 = never treated, 2 / 3 = treated starting in that period.
  group_assignment <- sample(c(0, 2, 3), n_units, replace = TRUE, prob = c(0.4, 0.3, 0.3))

  panel_data <- data.table(
    id = rep(1:n_units, each = n_periods),
    year = rep(1:n_periods, n_units),
    first_treat = rep(group_assignment, each = n_periods),
    partition = rep(sample(0:1, n_units, replace = TRUE), each = n_periods)
  )
  panel_data[, x1 := rnorm(.N)]
  panel_data[, x2 := rnorm(.N)]
  panel_data[, x3 := 2 * x1]  # perfectly collinear with x1
  panel_data[, outcome := 1 + 0.5 * x1 + 0.3 * x2 + rnorm(.N)]

  expect_warning(
    result <- ddd(
      yname = "outcome",
      tname = "year",
      idname = "id",
      gname = "first_treat",
      pname = "partition",
      xformla = ~ x1 + x2 + x3,
      data = panel_data,
      control_group = "nevertreated",
      base_period = "varying",
      est_method = "dr",
      boot = FALSE
    ),
    regexp = "collinearity"
  )

  # Estimation must still complete after dropping x3.
  expect_true(!is.null(result))
  expect_true(inherits(result, "ddd"))
})

test_that("run_preprocess_multPeriods handles no collinearity case correctly", {
  # With three independent covariates there must be no collinearity warning.
  # Fix: the earlier version wrapped ddd() in suppressWarnings() INSIDE
  # expect_no_warning(), which swallowed every warning before the expectation
  # could see it, making the test vacuous (it could never fail). The call is
  # now unwrapped; `message = "collinearity"` restricts the expectation to
  # collinearity warnings, so unrelated warnings do not cause failures.
  library(data.table)

  set.seed(54321)
  n_units <- 200
  n_periods <- 4

  group_assignment <- sample(c(0, 2, 3), n_units, replace = TRUE, prob = c(0.4, 0.3, 0.3))

  panel_data <- data.table(
    id = rep(1:n_units, each = n_periods),
    year = rep(1:n_periods, n_units),
    first_treat = rep(group_assignment, each = n_periods),
    partition = rep(sample(0:1, n_units, replace = TRUE), each = n_periods)
  )
  panel_data[, x1 := rnorm(.N)]
  panel_data[, x2 := rnorm(.N)]
  panel_data[, x3 := rnorm(.N)]
  panel_data[, outcome := 1 + 0.5 * x1 + 0.3 * x2 + 0.2 * x3 + rnorm(.N)]

  expect_no_warning(
    result <- ddd(
      yname = "outcome",
      tname = "year",
      idname = "id",
      gname = "first_treat",
      pname = "partition",
      xformla = ~ x1 + x2 + x3,
      data = panel_data,
      control_group = "nevertreated",
      base_period = "varying",
      est_method = "dr",
      boot = FALSE
    ),
    message = "collinearity"
  )

  expect_true(!is.null(result))
  expect_true(inherits(result, "ddd"))
})

test_that("run_preprocess_multPeriods detects constant covariate (collinear with intercept)", {
  # A constant covariate is collinear with the intercept and must trigger
  # the same collinearity warning in the multi-period path.
  library(data.table)

  set.seed(99999)
  n_units <- 200
  n_periods <- 4

  group_assignment <- sample(c(0, 2, 3), n_units, replace = TRUE, prob = c(0.4, 0.3, 0.3))

  panel_data <- data.table(
    id = rep(1:n_units, each = n_periods),
    year = rep(1:n_periods, n_units),
    first_treat = rep(group_assignment, each = n_periods),
    partition = rep(sample(0:1, n_units, replace = TRUE), each = n_periods)
  )
  panel_data[, x1 := rnorm(.N)]
  panel_data[, x2 := rnorm(.N)]
  panel_data[, x3 := 5]  # constant -> collinear with the intercept
  panel_data[, outcome := 1 + 0.5 * x1 + 0.3 * x2 + rnorm(.N)]

  expect_warning(
    result <- ddd(
      yname = "outcome",
      tname = "year",
      idname = "id",
      gname = "first_treat",
      pname = "partition",
      xformla = ~ x1 + x2 + x3,
      data = panel_data,
      control_group = "nevertreated",
      base_period = "varying",
      est_method = "dr",
      boot = FALSE
    ),
    regexp = "collinearity"
  )

  expect_true(!is.null(result))
  expect_true(inherits(result, "ddd"))
})
# =============================================================================
# Repeated Cross-Section (RCS) Propensity Score Trimming Tests
# =============================================================================

test_that("compute_pscore_rc returns keep_ps indicator for trimming in RCS data", {
  # RCS counterpart of the keep_ps contract: every observation is
  # independent (unique id), so the indicator covers all rows of the
  # subgroup-{3, 4} condition data rather than unique panel ids.
  library(data.table)

  set.seed(1111)
  n_pre <- 100
  n_post <- 100

  rcs <- data.table(
    id = 1:(n_pre + n_post),  # one row per observation
    post = c(rep(0, n_pre), rep(1, n_post)),
    subgroup = c(
      rep(4, n_pre / 2), rep(3, n_pre / 2),    # pre-period: half treated, half control
      rep(4, n_post / 2), rep(3, n_post / 2)   # post-period: half treated, half control
    ),
    y = rnorm(n_pre + n_post),
    weights = rep(1, n_pre + n_post),
    x1 = rnorm(n_pre + n_post),
    x2 = rnorm(n_pre + n_post)
  )

  res <- triplediff:::compute_pscore_rc(
    data = rcs,
    condition_subgroup = 3,
    xformula = ~ x1 + x2,
    trim_level = 0.995
  )

  expect_true("propensity_scores" %in% names(res))
  expect_true("hessian_matrix" %in% names(res))
  expect_true("keep_ps" %in% names(res))

  # Indicator length matches the full condition data (no collapsing by id).
  cond <- rcs[rcs$subgroup %in% c(3, 4)]
  expect_equal(length(res$keep_ps), nrow(cond))
  expect_true(is.logical(res$keep_ps))
})

test_that("compute_pscore_null_rc returns keep_ps = TRUE for all units (REG method in RCS)", {
  # Regression-only estimation does not reweight, so nothing is trimmed.
  library(data.table)

  set.seed(2222)
  n <- 200
  rcs <- data.table(
    id = 1:n,
    post = c(rep(0, n / 2), rep(1, n / 2)),
    subgroup = rep(c(4, 3), n / 2),
    y = rnorm(n),
    weights = rep(1, n)
  )

  res <- triplediff:::compute_pscore_null_rc(
    data = rcs,
    condition_subgroup = 3
  )

  expect_true("keep_ps" %in% names(res))
  expect_true(all(res$keep_ps))

  # One indicator per observation in subgroups 3 and 4.
  cond <- rcs[rcs$subgroup %in% c(3, 4)]
  expect_equal(length(res$keep_ps), nrow(cond))
})

test_that("RCS propensity score trimming excludes controls with ps >= 0.995", {
  # Controls whose covariates mimic the treated distribution should be
  # flagged for trimming in the RCS path as well.
  library(data.table)

  set.seed(3333)
  n_treated <- 100
  n_control <- 100

  # Treated: x1 ~ N(3, 0.5). Controls: mostly N(-1, 1) plus 10
  # "almost treated" draws from N(3, 0.1).
  treated_x1 <- rnorm(n_treated, mean = 3, sd = 0.5)
  n_high_ps_controls <- 10
  control_x1 <- c(
    rnorm(n_control - n_high_ps_controls, mean = -1, sd = 1),
    rnorm(n_high_ps_controls, mean = 3, sd = 0.1)
  )

  rcs <- data.table(
    id = 1:(n_treated + n_control),
    post = sample(0:1, n_treated + n_control, replace = TRUE),  # random period
    subgroup = c(rep(4, n_treated), rep(3, n_control)),
    y = rnorm(n_treated + n_control),
    weights = rep(1, n_treated + n_control),
    x1 = c(treated_x1, control_x1)
  )

  res <- suppressWarnings(triplediff:::compute_pscore_rc(
    data = rcs,
    condition_subgroup = 3,
    xformula = ~ x1,
    trim_level = 0.995
  ))

  cond <- rcs[rcs$subgroup %in% c(3, 4)]
  is_treated <- cond$subgroup == 4

  # Treated observations are never trimmed.
  expect_true(all(res$keep_ps[is_treated]))

  # Controls: keep exactly those below the trim level.
  ctrl_keep <- res$keep_ps[!is_treated]
  ctrl_ps <- res$propensity_scores[!is_treated]
  expect_true(all(ctrl_keep[ctrl_ps >= 0.995] == FALSE))
  expect_true(all(ctrl_keep[ctrl_ps < 0.995] == TRUE))
})
test_that("RCS propensity scores are capped at 1 - 1e-6 (following DRDID)", {
  # Perfect separation on x1 must not push RCS pscores to exactly 1.
  library(data.table)

  set.seed(4444)
  n_treated <- 50
  n_control <- 50

  # Treated x1 in [5, 10], controls in [-10, -5]: perfect separation.
  treated_x1 <- runif(n_treated, min = 5, max = 10)
  control_x1 <- runif(n_control, min = -10, max = -5)

  rcs <- data.table(
    id = 1:(n_treated + n_control),
    post = sample(0:1, n_treated + n_control, replace = TRUE),
    subgroup = c(rep(4, n_treated), rep(3, n_control)),
    y = rnorm(n_treated + n_control),
    weights = rep(1, n_treated + n_control),
    x1 = c(treated_x1, control_x1)
  )

  res <- suppressWarnings(
    triplediff:::compute_pscore_rc(
      data = rcs,
      condition_subgroup = 3,
      xformula = ~ x1,
      trim_level = 0.995
    )
  )

  # Bounded above by 1 - 1e-6 and strictly positive below.
  expect_true(all(res$propensity_scores <= 1 - 1e-6))
  expect_true(all(res$propensity_scores > 0))
})

test_that("RCS trimming works correctly with compute_did_rc for IPW method", {
  # Integration test: the RCS pscore object consumed by compute_did_rc()
  # must carry the trimming indicator, and high-ps controls must be flagged.
  library(data.table)

  set.seed(5555)
  n_treated <- 80
  n_control <- 80

  # Most controls are well separated; 5 are "almost treated".
  treated_x1 <- rnorm(n_treated, mean = 2, sd = 0.5)
  n_high_ps <- 5
  control_x1 <- c(
    rnorm(n_control - n_high_ps, mean = -2, sd = 1),
    rnorm(n_high_ps, mean = 2, sd = 0.1)
  )

  # RCS layout: first half pre-period, second half post-period.
  n_total <- n_treated + n_control
  rcs <- data.table(
    id = 1:n_total,
    post = c(rep(0, n_total / 2), rep(1, n_total / 2)),
    subgroup = rep(c(rep(4, n_treated / 2), rep(3, n_control / 2)), 2),
    y = rnorm(n_total),
    weights = rep(1, n_total),
    x1 = rep(c(treated_x1[1:(n_treated / 2)], control_x1[1:(n_control / 2)]), 2)
  )

  ps_res <- suppressWarnings(triplediff:::compute_pscore_rc(
    data = rcs,
    condition_subgroup = 3,
    xformula = ~ x1,
    trim_level = 0.995
  ))

  # Trimming info must be present.
  expect_true("keep_ps" %in% names(ps_res))

  # Controls above the trim level must be flagged.
  cond <- rcs[rcs$subgroup %in% c(3, 4)]
  is_treated <- cond$subgroup == 4
  ctrl_ps <- ps_res$propensity_scores[!is_treated]
  ctrl_keep <- ps_res$keep_ps[!is_treated]
  high_ps <- ctrl_ps >= 0.995
  if (any(high_ps)) {
    expect_true(all(ctrl_keep[high_ps] == FALSE))
  }
})
test_that("End-to-end RCS DDD estimation with trimming works correctly", {
  # Full pipeline with panel = FALSE: a few "almost treated" controls are
  # planted to exercise trimming, and the estimator must still return a
  # finite ATT and standard error.
  skip_on_cran()  # computation time
  library(data.table)

  set.seed(6666)
  n_obs <- 400
  n_per_cell <- n_obs / 8  # 8 cells: treat x partition x period

  rcs <- data.table(
    id = 1:n_obs,  # each row is an independent observation
    year = rep(c(1, 2), each = n_obs / 2),
    treat = rep(rep(c(0, 1), each = n_per_cell * 2), 2),
    partition = rep(rep(c(0, 1), each = n_per_cell), 4)
  )

  # Moderate covariate separation between treated and controls.
  rcs[, x1 := ifelse(treat == 1, rnorm(.N, mean = 1, sd = 1), rnorm(.N, mean = -1, sd = 1))]
  rcs[, x2 := rnorm(.N)]

  # Plant "almost treated" controls to trigger trimming.
  n_high_ps <- 5
  high_ps_idx <- sample(which(rcs$treat == 0), n_high_ps)
  rcs[high_ps_idx, x1 := rnorm(n_high_ps, mean = 5, sd = 0.1)]

  # DDD outcome with a known effect on the triple interaction.
  true_att <- 2
  rcs[, outcome := (
    1 +
      0.5 * partition +
      0.3 * year +
      0.4 * treat +
      0.3 * x1 + 0.2 * x2 +
      true_att * treat * year * partition +
      rnorm(.N, sd = 1)
  )]

  result <- suppressWarnings(
    ddd(
      yname = "outcome",
      tname = "year",
      idname = "id",
      gname = "treat",
      pname = "partition",
      xformla = ~ x1 + x2,
      data = rcs,
      panel = FALSE,  # repeated cross-section mode
      est_method = "dr",
      boot = FALSE
    )
  )

  expect_true(!is.null(result))
  expect_true(inherits(result, "ddd"))
  expect_true(!is.na(result$ATT))
  expect_true(!is.na(result$se))
})

test_that("RCS confidence interval coverage is valid after trimming", {
  # Monte Carlo check (RCS counterpart of the panel coverage test): 95% CIs
  # must cover the true ATT within +/- 2 points of nominal when trimming is
  # triggered by a handful of near-treated controls.
  skip_on_cran()  # expensive: 1000 replications of n = 2000

  library(data.table)

  set.seed(5678)
  n_sims <- 1000
  true_att <- 2
  alpha <- 0.05
  nominal_coverage <- 1 - alpha  # 0.95

  covers <- logical(n_sims)

  for (sim in seq_len(n_sims)) {
    # Balanced 2x2x2 design; every row is an independent observation.
    n_per_cell <- 250
    n_obs <- 8 * n_per_cell

    rcs_data <- data.table(
      id = 1:n_obs,
      year = rep(c(0, 1), each = n_obs / 2),
      treat = rep(rep(c(0, 1), each = n_per_cell * 2), 2),
      partition = rep(rep(c(0, 1), each = n_per_cell), 4)
    )

    # Moderate covariate imbalance with good overlap overall.
    rcs_data[, x1 := ifelse(treat == 1,
                            rnorm(.N, mean = 0.5, sd = 1),
                            rnorm(.N, mean = -0.5, sd = 1))]
    rcs_data[, x2 := rnorm(.N)]

    # ~10 near-treated controls with x1 ~ N(10, 0.1) trigger trimming.
    n_high_ps <- 10
    high_ps_idx <- sample(which(rcs_data$treat == 0), n_high_ps)
    rcs_data[high_ps_idx, x1 := rnorm(n_high_ps, mean = 10, sd = 0.1)]

    # DDD outcome: the triple interaction carries the true ATT.
    rcs_data[, outcome := (
      1 +
        0.5 * partition +
        0.3 * year +
        0.4 * treat +
        0.5 * x1 + 0.3 * x2 +
        0.2 * partition * year +
        0.2 * treat * year +
        0.1 * partition * treat +
        true_att * treat * year * partition +
        rnorm(.N, sd = 1)
    )]

    fit <- tryCatch({
      suppressWarnings(
        ddd(
          yname = "outcome",
          tname = "year",
          idname = "id",
          gname = "treat",
          pname = "partition",
          xformla = ~ x1 + x2,
          data = rcs_data,
          panel = FALSE,  # RCS mode
          est_method = "dr",
          boot = FALSE,
          inffunc = TRUE
        )
      )
    }, error = function(e) NULL)

    if (!is.null(fit) && !is.null(fit$ATT) && !is.null(fit$se) &&
        !is.na(fit$ATT) && !is.na(fit$se)) {
      est <- fit$ATT
      se <- fit$se
      ci_lower <- est - qnorm(1 - alpha / 2) * se
      ci_upper <- est + qnorm(1 - alpha / 2) * se
      covers[sim] <- (true_att >= ci_lower) & (true_att <= ci_upper)
    } else {
      covers[sim] <- NA  # failed replication, excluded from the rate
    }
  }

  valid_covers <- covers[!is.na(covers)]
  coverage_rate <- mean(valid_covers)

  expect_true(coverage_rate >= nominal_coverage - 0.02,
              info = paste("RCS Coverage rate:", round(coverage_rate, 4),
                           "- expected at least", nominal_coverage - 0.02))
  expect_true(coverage_rate <= nominal_coverage + 0.02,
              info = paste("RCS Coverage rate:", round(coverage_rate, 4),
                           "- expected at most", nominal_coverage + 0.02))
})
# =============================================================================
# End-to-End Panel DDD Estimation with Trimming Tests
# =============================================================================

test_that("End-to-end Panel DDD estimation with trimming works correctly", {
  # Full pipeline with panel = TRUE: time-invariant unit covariates, a few
  # "almost treated" controls to exercise trimming, and a sanity check that
  # the point estimate lands within 3 SE of the true effect.
  skip_on_cran()  # computation time
  library(data.table)

  set.seed(7777)
  n_units <- 200
  n_per_cell <- n_units / 4  # 4 cells: treat x partition

  # Unit-level attributes, constant across time.
  unit_data <- data.table(
    id = 1:n_units,
    treat = rep(c(0, 1), each = n_units / 2),
    partition = rep(rep(c(0, 1), each = n_per_cell), 2)
  )

  # Moderate covariate separation between treated and controls.
  unit_data[, x1 := ifelse(treat == 1, rnorm(.N, mean = 1, sd = 1), rnorm(.N, mean = -1, sd = 1))]
  unit_data[, x2 := rnorm(.N)]

  # Plant "almost treated" controls to trigger trimming.
  n_high_ps <- 5
  high_ps_idx <- sample(which(unit_data$treat == 0), n_high_ps)
  unit_data[high_ps_idx, x1 := rnorm(n_high_ps, mean = 5, sd = 0.1)]

  # Expand to a 2-period panel (same units in both years).
  test_data <- rbindlist(lapply(c(1, 2), function(t) {
    dt <- copy(unit_data)
    dt[, year := t]
    dt
  }))
  setorder(test_data, id, year)

  # DDD outcome; (year - 1) makes year 1 the pre-period.
  true_att <- 2
  test_data[, outcome := (
    1 +
      0.5 * partition +
      0.3 * (year - 1) +
      0.4 * treat +
      0.3 * x1 + 0.2 * x2 +
      true_att * treat * (year - 1) * partition +
      rnorm(.N, sd = 1)
  )]

  result <- suppressWarnings(
    ddd(
      yname = "outcome",
      tname = "year",
      idname = "id",
      gname = "treat",
      pname = "partition",
      xformla = ~ x1 + x2,
      data = test_data,
      panel = TRUE,  # explicit panel mode
      est_method = "dr",
      boot = FALSE
    )
  )

  expect_true(!is.null(result))
  expect_true(inherits(result, "ddd"))
  expect_true(!is.na(result$ATT))
  expect_true(!is.na(result$se))

  # Sanity check (not a strict test): estimate within 3 SE of the truth.
  expect_true(abs(result$ATT - true_att) < 3 * result$se,
              info = paste("ATT:", round(result$ATT, 3), "SE:", round(result$se, 3)))
})
test_data, + panel = TRUE, # Explicitly set panel mode + est_method = "dr", + boot = FALSE + ) + ) + + # Result should be valid + expect_true(!is.null(result)) + expect_true(inherits(result, "ddd")) + expect_true(!is.na(result$ATT)) + expect_true(!is.na(result$se)) + + # ATT should be reasonably close to true value (within 3 SE) + # This is a sanity check, not a strict test + expect_true(abs(result$ATT - true_att) < 3 * result$se, + info = paste("ATT:", round(result$ATT, 3), "SE:", round(result$se, 3))) +}) + +test_that("End-to-end Panel DDD estimation with IPW method and trimming", { + # Test IPW method specifically to ensure trimming is applied correctly + skip_on_cran() + library(data.table) + + set.seed(8888) + n_units <- 200 + + n_per_cell <- n_units / 4 + + # Create unit-level data + unit_data <- data.table( + id = 1:n_units, + treat = rep(c(0, 1), each = n_units/2), + partition = rep(rep(c(0, 1), each = n_per_cell), 2) + ) + + # Add covariates - treated units have higher x1 + unit_data[, x1 := ifelse(treat == 1, rnorm(.N, mean = 1.5, sd = 1), rnorm(.N, mean = -1.5, sd = 1))] + unit_data[, x2 := rnorm(.N)] + + # Add "almost treated" controls to trigger trimming + n_high_ps <- 8 + high_ps_indices <- sample(which(unit_data$treat == 0), n_high_ps) + unit_data[high_ps_indices, x1 := rnorm(n_high_ps, mean = 6, sd = 0.1)] + + # Expand to panel + test_data <- rbindlist(lapply(c(1, 2), function(t) { + dt <- copy(unit_data) + dt[, year := t] + dt + })) + setorder(test_data, id, year) + + # Generate outcome + true_att <- 1.5 + test_data[, outcome := ( + 1 + + 0.5 * partition + + 0.3 * (year - 1) + + 0.4 * treat + + 0.3 * x1 + 0.2 * x2 + + true_att * treat * (year - 1) * partition + + rnorm(.N, sd = 1) + )] + + # Run DDD with IPW method + result <- suppressWarnings( + ddd( + yname = "outcome", + tname = "year", + idname = "id", + gname = "treat", + pname = "partition", + xformla = ~ x1 + x2, + data = test_data, + panel = TRUE, + est_method = "ipw", + boot = FALSE + ) + 
) + + # Result should be valid + expect_true(!is.null(result)) + expect_true(inherits(result, "ddd")) + expect_true(!is.na(result$ATT)) + expect_true(!is.na(result$se)) +}) + +test_that("End-to-end Panel DDD estimation with REG method (no trimming needed)", { + # REG method doesn't use propensity scores, so trimming shouldn't affect it + skip_on_cran() + library(data.table) + + set.seed(9999) + n_units <- 200 + + n_per_cell <- n_units / 4 + + # Create unit-level data + unit_data <- data.table( + id = 1:n_units, + treat = rep(c(0, 1), each = n_units/2), + partition = rep(rep(c(0, 1), each = n_per_cell), 2) + ) + + # Add covariates + unit_data[, x1 := ifelse(treat == 1, rnorm(.N, mean = 1, sd = 1), rnorm(.N, mean = -1, sd = 1))] + unit_data[, x2 := rnorm(.N)] + + # Add "almost treated" controls (shouldn't matter for REG) + n_high_ps <- 5 + high_ps_indices <- sample(which(unit_data$treat == 0), n_high_ps) + unit_data[high_ps_indices, x1 := rnorm(n_high_ps, mean = 5, sd = 0.1)] + + # Expand to panel + test_data <- rbindlist(lapply(c(1, 2), function(t) { + dt <- copy(unit_data) + dt[, year := t] + dt + })) + setorder(test_data, id, year) + + # Generate outcome + true_att <- 2.5 + test_data[, outcome := ( + 1 + + 0.5 * partition + + 0.3 * (year - 1) + + 0.4 * treat + + 0.3 * x1 + 0.2 * x2 + + true_att * treat * (year - 1) * partition + + rnorm(.N, sd = 1) + )] + + # Run DDD with REG method + result <- suppressWarnings( + ddd( + yname = "outcome", + tname = "year", + idname = "id", + gname = "treat", + pname = "partition", + xformla = ~ x1 + x2, + data = test_data, + panel = TRUE, + est_method = "reg", + boot = FALSE + ) + ) + + # Result should be valid + expect_true(!is.null(result)) + expect_true(inherits(result, "ddd")) + expect_true(!is.na(result$ATT)) + expect_true(!is.na(result$se)) +})