Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ test_print_cohorts_direct.R
test_rcs_all_methods.R
test_inffunc.R
/.quarto/
..Rcheck/00check.log
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: triplediff
Title: Triple-Difference Estimators
Version: 0.2.0
Version: 0.2.1
Authors@R: c(person("Marcelo", "Ortiz-Villavicencio", email = "[email protected]", role = c("aut", "cre")),
person("Pedro H. C.", "Sant'Anna", email = "[email protected]", role = c("aut"))
)
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,10 @@

* Replaced `parglm` with `fastglm` to avoid issues related to parglm's scheduled archival on 2026-01-29.
* Added support for unbalanced panel data and repeated cross-sectional data by properly implementing the `allow_unbalanced_panel` parameter across all functions.

# triplediff 0.2.1
* Add asymmetric propensity score trimming for control units with pscore >= 0.995.
* Add partition-specific collinearity detection with two-stage checking.
* Add a comprehensive test suite, including a Monte Carlo coverage test for estimation with propensity score trimming.


222 changes: 175 additions & 47 deletions R/compute_nuisances.R

Large diffs are not rendered by default.

160 changes: 160 additions & 0 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,60 @@
#' @import BMisc
#' @export
NULL
#--------------------------------------------------
# Helper function to check partition-specific collinearity
#
# For every comparison subgroup g < 4 found in `data`, the rows belonging to
# subgroup 4 or to g are selected and a QR decomposition of the covariate
# columns is used to find columns that are linearly dependent on the columns
# preceding them within that partition.
#
# Arguments:
# - data: data.table or data.frame containing `subgroup_col` and `cov_cols`
# - subgroup_col: name of the column holding the subgroup indicator
# - cov_cols: character vector with the names of the covariate columns to check
# - tol: tolerance passed to qr() when detecting linear dependencies
#
# Returns a list with:
# - collinear_vars: named list mapping dropped variables to the partition(s) where they're collinear
# - all_collinear: character vector of all variables that should be dropped globally
#   (character(0) when nothing is collinear)
#
# NOTE(review): the check uses the covariate columns exactly as passed; no
# intercept column is added here, so a covariate that is constant (and
# non-zero) within a partition is not flagged. Confirm whether the downstream
# models add an intercept and, if so, whether this case should be caught.
check_partition_collinearity <- function(data, subgroup_col, cov_cols, tol = 1e-6) {
  # Track which variables are collinear in which partition
  partition_collinear <- list()

  # Nothing to check without covariates
  if (length(cov_cols) == 0) {
    return(list(
      collinear_vars = partition_collinear,
      all_collinear = character(0)
    ))
  }

  subgroup_ids <- data[[subgroup_col]]

  # Build the covariate matrix once. Going through as.data.frame() makes the
  # column selection work for both data.table and plain data.frame inputs.
  cov_all <- as.matrix(as.data.frame(data)[, cov_cols, drop = FALSE])

  # We check comparisons: 4 vs 3, 4 vs 2, 4 vs 1
  # These are the partitions used in the DDD estimation
  comparison_groups <- sort(unique(subgroup_ids))
  comparison_groups <- comparison_groups[comparison_groups < 4]

  for (comp_group in comparison_groups) {
    # Rows of subgroup 4 together with the comparison group
    in_partition <- subgroup_ids %in% c(4, comp_group)
    cov_subset <- cov_all[in_partition, , drop = FALSE]

    # Skip if no observations (shouldn't happen, but be safe)
    if (nrow(cov_subset) == 0) next

    # The first `rank` entries of the QR pivot index the linearly independent
    # columns; every other column is collinear within this partition
    qr_subset <- qr(cov_subset, tol = tol)
    independent_vars <- cov_cols[qr_subset$pivot[seq_len(qr_subset$rank)]]
    collinear_in_subset <- setdiff(cov_cols, independent_vars)

    partition_name <- paste0("subgroup 4 vs ", comp_group)
    for (var in collinear_in_subset) {
      # c(NULL, x) is x, so first-time and repeated entries share one path
      partition_collinear[[var]] <- c(partition_collinear[[var]], partition_name)
    }
  }

  list(
    collinear_vars = partition_collinear,
    all_collinear = as.character(names(partition_collinear))
  )
}

#--------------------------------------------------
# Function to pre-process the data to use on ddd estimator

Expand Down Expand Up @@ -253,13 +307,48 @@ run_nopreprocess_2periods <- function(yname,
rank_m <- qr_m$rank
# Get the indices of the non-collinear columns
non_collinear_indices <- qr_m$pivot[seq_len(rank_m)]
# Check if any covariates were dropped due to global collinearity (following DRDID approach)
dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices])
if (length(dropped_covariates_global) > 0) {
warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", "))
}
# Drop the collinear columns from the data.table
cleaned_data <- cleaned_data[, c(seq(1,idx_static_vars,1), non_collinear_indices + idx_static_vars), with = FALSE]

# drop the intercept
#cleaned_data[, (idx_static_vars+1) := NULL]
cleaned_data[, "(Intercept)" := NULL]

# Check for partition-specific collinearity (after global check)
# Get remaining covariate column names (excluding static vars)
cov_cols_remaining <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup"))
if (length(cov_cols_remaining) > 0) {
partition_check <- check_partition_collinearity(cleaned_data, "subgroup", cov_cols_remaining)

if (length(partition_check$all_collinear) > 0) {
# Build informative warning message
partition_warnings <- sapply(names(partition_check$collinear_vars), function(var) {
partitions <- paste(partition_check$collinear_vars[[var]], collapse = ", ")
paste0(" - ", var, " (collinear in: ", partitions, ")")
})
warning("The following covariates were dropped due to partition-specific collinearity:\n",
paste(partition_warnings, collapse = "\n"))

# Drop these covariates globally
cols_to_keep <- setdiff(names(cleaned_data), partition_check$all_collinear)
cleaned_data <- cleaned_data[, ..cols_to_keep]
}
}

# Update xformula to reflect dropped covariates (following DRDID approach)
# Get final covariate column names after all collinearity checks
final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup"))
if (length(final_cov_cols) > 0) {
xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + ")))
} else {
xformla <- ~1
}

out <- list(preprocessed_data = cleaned_data,
xformula = xformla,
est_method = est_method,
Expand Down Expand Up @@ -603,12 +692,47 @@ run_preprocess_2Periods <- function(yname,
rank_m <- qr_m$rank
# Get the indices of the non-collinear columns
non_collinear_indices <- qr_m$pivot[seq_len(rank_m)]
# Check if any covariates were dropped due to global collinearity (following DRDID approach)
dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices])
if (length(dropped_covariates_global) > 0) {
warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", "))
}
# Drop the collinear columns from the data.table
cleaned_data <- cleaned_data[, c(seq(1,idx_static_vars,1), non_collinear_indices + idx_static_vars), with = FALSE]

# drop the intercept
cleaned_data[, "(Intercept)" := NULL]

# Check for partition-specific collinearity (after global check)
# Get remaining covariate column names (excluding static vars)
cov_cols_remaining <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup"))
if (length(cov_cols_remaining) > 0) {
partition_check <- check_partition_collinearity(cleaned_data, "subgroup", cov_cols_remaining)

if (length(partition_check$all_collinear) > 0) {
# Build informative warning message
partition_warnings <- sapply(names(partition_check$collinear_vars), function(var) {
partitions <- paste(partition_check$collinear_vars[[var]], collapse = ", ")
paste0(" - ", var, " (collinear in: ", partitions, ")")
})
warning("The following covariates were dropped due to partition-specific collinearity:\n",
paste(partition_warnings, collapse = "\n"))

# Drop these covariates globally
cols_to_keep <- setdiff(names(cleaned_data), partition_check$all_collinear)
cleaned_data <- cleaned_data[, ..cols_to_keep]
}
}

# Update xformula to reflect dropped covariates (following DRDID approach)
# Get final covariate column names after all collinearity checks
final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "post", "treat", "period", "partition", "weights", "cluster", "subgroup"))
if (length(final_cov_cols) > 0) {
xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + ")))
} else {
xformla <- ~1
}

out <- list(preprocessed_data = cleaned_data,
xformula = xformla,
tname = tname,
Expand Down Expand Up @@ -1037,9 +1161,45 @@ run_preprocess_multPeriods <- function(yname,
data = dta,
na.action = na.pass))
}

# Remove collinear variables (following DRDID approach)
# Determine the number of static columns (before covariates)
# Static cols: id, y, first_treat, period, partition, weights, [cluster]
idx_static_vars <- ifelse(is.null(cluster), 6, 7)

# Convert the covariate columns (including intercept) to a matrix
cov_m <- as.matrix(cleaned_data[, -c(1:idx_static_vars), with = FALSE])

# Only check for collinearity if there are covariates beyond the intercept
if (ncol(cov_m) > 1) {
# Use the qr() function to detect collinear columns
qr_m <- qr(cov_m, tol = 1e-6)
# Get the rank of the matrix
rank_m <- qr_m$rank
# Get the indices of the non-collinear columns
non_collinear_indices <- qr_m$pivot[seq_len(rank_m)]
# Check if any covariates were dropped due to global collinearity
dropped_covariates_global <- setdiff(colnames(cov_m), colnames(cov_m)[non_collinear_indices])
if (length(dropped_covariates_global) > 0) {
warning("The following covariates were dropped due to global collinearity: ", paste(dropped_covariates_global, collapse = ", "))
}
# Drop the collinear columns from the data.table
cleaned_data <- cleaned_data[, c(seq(1, idx_static_vars, 1), non_collinear_indices + idx_static_vars), with = FALSE]
}

# drop the intercept
cleaned_data[, "(Intercept)" := NULL]

# Update xformula to reflect dropped covariates (following DRDID approach)
# Get final covariate column names after collinearity check
# Static cols in multi-period: id, y, first_treat, period, partition, weights, [cluster]
final_cov_cols <- setdiff(names(cleaned_data), c("id", "y", "first_treat", "period", "partition", "weights", "cluster"))
if (length(final_cov_cols) > 0) {
xformla <- stats::as.formula(paste("~", paste(final_cov_cols, collapse = " + ")))
} else {
xformla <- ~1
}

# order dataset wrt idname and tname
setorder(cleaned_data, "id", "period")

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Triple Differences Estimators <img src="man/figures/triplediff-logo.png" align="right" alt="" width="155" />

![](https://img.shields.io/badge/release%20lifecycle-alpha-orange.svg)
[![](https://img.shields.io/badge/devel%20version-0.2.0-blue.svg)](https://github.com/marcelortizv/triplediff)
[![](https://img.shields.io/badge/devel%20version-0.2.1-blue.svg)](https://github.com/marcelortizv/triplediff)
[![](https://img.shields.io/badge/doi-10.48550/arXiv.2505.09942-yellow.svg)](https://doi.org/10.48550/arXiv.2505.09942)

<!-- README.md is generated from README.Rmd. Please edit that file -->
Expand Down
Binary file modified man/figures/README-unnamed-chunk-10-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading