diff --git a/README.md b/README.md index fa4768b..1289a7d 100644 --- a/README.md +++ b/README.md @@ -308,4 +308,4 @@ If you want to cite CSDID, you can use the following BibTeX entry: year = {2024}, url = {https://github.com/d2cml-ai/csdid} } -``` +``` \ No newline at end of file diff --git a/csdid/aggte_fnc/utils.py b/csdid/aggte_fnc/utils.py index a1db61e..f858905 100644 --- a/csdid/aggte_fnc/utils.py +++ b/csdid/aggte_fnc/utils.py @@ -105,8 +105,9 @@ def AGGTEobj(overall_att=None, out1 = np.column_stack((out["overall_att"], out["overall_se"], overall_cband_lower, overall_cband_upper)) out1 = np.round(out1, 4) overall_sig = (overall_cband_upper < 0) | (overall_cband_lower > 0) - overall_sig[np.isnan(overall_sig)] = False - overall_sig_text = np.where(overall_sig, "*", "") + overall_sig = np.asarray(overall_sig) + overall_sig = np.where(np.isnan(overall_sig), False, overall_sig) + overall_sig_text = np.atleast_1d(np.where(overall_sig, "*", "")) out1 = np.column_stack((out1, overall_sig_text)) print("\n") @@ -193,4 +194,3 @@ def AGGTEobj(overall_att=None, return out - diff --git a/csdid/att_gt.py b/csdid/att_gt.py index de84803..ae75ece 100644 --- a/csdid/att_gt.py +++ b/csdid/att_gt.py @@ -23,9 +23,10 @@ def __init__(self, yname, tname, idname, gname, data, control_group = ['nevertre cband = False, biters = 1000, alp = 0.05 ): dp = pre_process_did( - yname=yname, tname = tname, idname=idname, gname = gname, - data = data, control_group=control_group, anticipation=anticipation, - xformla=xformla, panel=panel, allow_unbalanced_panel=allow_unbalanced_panel, cband=cband, clustervar=None, weights_name=None + yname=yname, tname=tname, idname=idname, gname=gname, + data=data, control_group=control_group, anticipation=anticipation, + xformla=xformla, panel=panel, allow_unbalanced_panel=allow_unbalanced_panel, + cband=cband, clustervar=clustervar, weights_name=weights_name ) dp['biters'] = biters diff --git a/csdid/attgt_fnc/compute_att_gt.py b/csdid/attgt_fnc/compute_att_gt.py index d4917fa..4831afc 100644 --- a/csdid/attgt_fnc/compute_att_gt.py +++ b/csdid/attgt_fnc/compute_att_gt.py @@ -1,6 +1,7 @@ import numpy as np, pandas as pd import patsy -from drdid import drdid, reg_did, ipwd_did +from drdid import reg_did +from csdid.attgt_fnc import drdid_trim from csdid.utils.bmisc import panel2cs2 import warnings @@ -37,6 +38,26 @@ def compute_att_gt(dp, est_method = "dr", base_period = 'varying'): att_est, group, year, post_array = [], [], [], [] + def build_covariates(formula, frame): + try: + _, cov = fml(formula, data=frame, return_type='dataframe') + except Exception as e: + try: + cov = patsy.dmatrix(formula, data=frame, return_type='dataframe') + except Exception as e2: + print(f"Warning: Formula processing failed: {e2}") + y_str, x_str = formula.split("~") + xs1 = x_str.split('+') + xs1_col_names = [x.strip() for x in xs1 if x.strip() != '1'] + n_dis = len(frame) + ones = np.ones((n_dis, 1)) + try: + cov = frame[xs1_col_names].to_numpy() + cov = np.append(cov, ones, axis=1) + except Exception: + cov = ones + return np.array(cov) + def add_att_data(att = 0, pst = 0, inf_f = []): inf_func.append(inf_f) att_est.append(att) @@ -75,12 +96,6 @@ def add_att_data(att = 0, pst = 0, inf_f = []): f"There are no pre-treatment periods for the group first treated at {g}. Units from this group are dropped." ) - # If we are in the universal base period - if base_period == 'universal' and pret == tn: - # Normalize results to zero and skip computation - add_att_data(att=0, pst=0, inf_f=np.zeros(len(data))) - continue - # For non-never treated groups, set up the control group indicator 'C' if not never_treated: # Units that are either never treated (gname == 0) or treated in the future @@ -113,13 +128,12 @@ def add_att_data(att = 0, pst = 0, inf_f = []): # ----------------------------------------------------------------------------- # Debugging and validation - if base_period == 'universal' and pret == tn: - # Normalize results to zero and break the loop - add_att_data(att=0, pst=post_treat, inf_f=np.zeros(len(data))) - continue - # Post-treatment dummy variable + pret_year = tlist[pret] post_treat = 1 * (g <= tn) + if base_period == 'universal' and pret_year == tn: + add_att_data(att=0, pst=post_treat, inf_f=np.zeros(n)) + continue # Subset the data for the current and pretreatment periods disdat = data[(data[tname] == tn) | (data[tname] == tlist[pret])] @@ -139,9 +153,9 @@ def add_att_data(att = 0, pst = 0, inf_f = []): C = disdat.C w = disdat.w - ypre = disdat.y0 if tn > pret else disdat.y1 - ypost = disdat.y0 if tn < pret else disdat.y1 - _, covariates = fml(xformla, data = disdat, return_type = 'dataframe') + ypre = disdat.y0 if tn > pret_year else disdat.y1 + ypost = disdat.y0 if tn < pret_year else disdat.y1 + covariates = build_covariates(xformla, disdat) G, C, w, ypre = map(np.array, [G, C, w, ypre]) ypost, covariates = map(np.array, [ypost, covariates]) @@ -151,9 +165,9 @@ def add_att_data(att = 0, pst = 0, inf_f = []): elif est_method == "reg": est_att_f = reg_did.reg_did_panel elif est_method == "ipw": - est_att_f = ipwd_did.std_ipw_did_panel + est_att_f = drdid_trim.std_ipw_did_panel elif est_method == "dr": - est_att_f = drdid.drdid_panel + est_att_f = drdid_trim.drdid_panel att_gt, att_inf_func = est_att_f(ypost, ypre, G, i_weights=w, covariates=covariates) @@ -224,21 +238,7 @@ def add_att_data(att = 0, pst = 0, inf_f = []): continue # return (inf_func) - try: - _, covariates = fml(xformla, data = disdat, return_type = 'dataframe') - covariates = np.array(covariates) - except Exception as e: - print(f"Warning: Formula processing failed: {e}") - y_str, x_str = xformla.split("~") - xs1 = x_str.split('+') - xs1_col_names = [x.strip() for x in xs1 if x.strip() != '1'] - n_dis = len(disdat) - ones = np.ones((n_dis, 1)) - try: - covariates = disdat[xs1_col_names].to_numpy() - covariates = np.append(covariates, ones, axis=1) - except: - covariates = ones + covariates = build_covariates(xformla, disdat) #----------------------------------------------------------------------------- # code for actually computing att(g,t) @@ -250,9 +250,9 @@ def add_att_data(att = 0, pst = 0, inf_f = []): elif est_method == "reg": est_att_f = reg_did.reg_did_rc elif est_method == "ipw": - est_att_f = ipwd_did.std_ipw_did_rc + est_att_f = drdid_trim.std_ipw_did_rc elif est_method == "dr": - est_att_f = drdid.drdid_rc + est_att_f = drdid_trim.drdid_rc att_gt, att_inf_func = est_att_f(y=Y, post=post, D = G, i_weights=w, covariates=covariates) # print(att_inf_func) @@ -504,4 +504,4 @@ def add_att_data(att = 0, pst = 0, inf_f = []): # "att" : att_est, # 'post ': post_array # } -# return (output, np.array(inf_func)) \ No newline at end of file +# return (output, np.array(inf_func)) diff --git a/csdid/attgt_fnc/drdid_trim.py b/csdid/attgt_fnc/drdid_trim.py new file mode 100644 index 0000000..d9014ea --- /dev/null +++ b/csdid/attgt_fnc/drdid_trim.py @@ -0,0 +1,359 @@ +import numpy as np +import statsmodels.api as sm + + +def _add_intercept(covariates, n): + if covariates is None: + return np.ones((n, 1)) + covariates = np.asarray(covariates) + if covariates.ndim == 1: + covariates = covariates[:, None] + if np.all(covariates[:, 0] == 1): + return covariates + return np.column_stack((np.ones(n), covariates)) + + +def _trim_pscore(ps_fit, D, trim_level): + ps_fit = np.minimum(ps_fit, 1 - 1e-6) + trim_ps = ps_fit < 1.01 + controls = D == 0 + trim_ps[controls] = ps_fit[controls] < trim_level + return ps_fit, trim_ps + + +def std_ipw_did_panel(y1, y0, D, covariates=None, i_weights=None, trim_level=0.995): + D = np.asarray(D).flatten() + n = len(D) + delta_y = np.asarray(y1 - y0).flatten() + int_cov = _add_intercept(covariates, n) + + if i_weights is None: + i_weights = np.ones(n) + else: + i_weights = np.asarray(i_weights).flatten() + if np.min(i_weights) < 0: + raise ValueError("i_weights must be non-negative") + i_weights = i_weights / np.mean(i_weights) + + pscore_model = sm.GLM(D, int_cov, family=sm.families.Binomial(), freq_weights=i_weights) + pscore_results = pscore_model.fit() + if not pscore_results.converged: + print("Warning: glm algorithm did not converge") + if np.any(np.isnan(pscore_results.params)): + raise ValueError( + "Propensity score model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + ps_fit, trim_ps = _trim_pscore(pscore_results.predict(), D, trim_level) + + w_treat = trim_ps * i_weights * D + w_cont = trim_ps * i_weights * ps_fit * (1 - D) / (1 - ps_fit) + att_treat = w_treat * delta_y + att_cont = w_cont * delta_y + + eta_treat = np.mean(att_treat) / np.mean(w_treat) + eta_cont = np.mean(att_cont) / np.mean(w_cont) + ipw_att = eta_treat - eta_cont + + score_ps = i_weights[:, np.newaxis] * (D - ps_fit)[:, np.newaxis] * int_cov + Hessian_ps = pscore_results.cov_params() * n + asy_lin_rep_ps = np.dot(score_ps, Hessian_ps) + + inf_treat = (att_treat - w_treat * eta_treat) / np.mean(w_treat) + inf_cont_1 = att_cont - w_cont * eta_cont + pre_m2 = w_cont * (delta_y - eta_cont) + M2 = np.mean(pre_m2[:, np.newaxis] * int_cov, axis=0) + inf_cont_2 = np.dot(asy_lin_rep_ps, M2) + + inf_control = (inf_cont_1 + inf_cont_2) / np.mean(w_cont) + att_inf_func = inf_treat - inf_control + return ipw_att, att_inf_func + + +def drdid_panel(y1, y0, D, covariates=None, i_weights=None, trim_level=0.995): + D = np.asarray(D).flatten() + n = len(D) + deltaY = np.asarray(y1 - y0).flatten() + int_cov = _add_intercept(covariates, n) + + if i_weights is None: + i_weights = np.ones(n) + else: + i_weights = np.asarray(i_weights).flatten() + if np.min(i_weights) < 0: + raise ValueError("i_weights must be non-negative") + i_weights = i_weights / np.mean(i_weights) + + pscore_model = sm.GLM(D, int_cov, family=sm.families.Binomial(), freq_weights=i_weights) + pscore_results = pscore_model.fit() + if not pscore_results.converged: + print("Warning: glm algorithm did not converge") + if np.any(np.isnan(pscore_results.params)): + raise ValueError( + "Propensity score model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + ps_fit, trim_ps = _trim_pscore(pscore_results.predict(), D, trim_level) + + mask = D == 0 + reg_model = sm.WLS(deltaY[mask], int_cov[mask], weights=i_weights[mask]) + reg_results = reg_model.fit() + if np.any(np.isnan(reg_results.params)): + raise ValueError( + "Outcome regression model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + out_delta = np.dot(int_cov, reg_results.params) + + w_treat = trim_ps * i_weights * D + w_cont = trim_ps * i_weights * ps_fit * (1 - D) / (1 - ps_fit) + dr_att_treat = w_treat * (deltaY - out_delta) + dr_att_cont = w_cont * (deltaY - out_delta) + + eta_treat = np.mean(dr_att_treat) / np.mean(w_treat) + eta_cont = np.mean(dr_att_cont) / np.mean(w_cont) + dr_att = eta_treat - eta_cont + + weights_ols = i_weights * (1 - D) + wols_x = weights_ols[:, np.newaxis] * int_cov + wols_eX = weights_ols[:, np.newaxis] * (deltaY - out_delta)[:, np.newaxis] * int_cov + XpX_inv = np.linalg.inv(np.dot(wols_x.T, int_cov) / n) + asy_lin_rep_wols = np.dot(wols_eX, XpX_inv) + + score_ps = i_weights[:, np.newaxis] * (D - ps_fit)[:, np.newaxis] * int_cov + Hessian_ps = pscore_results.cov_params() * n + asy_lin_rep_ps = np.dot(score_ps, Hessian_ps) + + inf_treat_1 = dr_att_treat - w_treat * eta_treat + M1 = np.mean(w_treat[:, np.newaxis] * int_cov, axis=0) + inf_treat_2 = np.dot(asy_lin_rep_wols, M1) + inf_treat = (inf_treat_1 - inf_treat_2) / np.mean(w_treat) + + inf_cont_1 = dr_att_cont - w_cont * eta_cont + M2 = np.mean(w_cont[:, np.newaxis] * (deltaY - out_delta - eta_cont)[:, np.newaxis] * int_cov, axis=0) + inf_cont_2 = np.dot(asy_lin_rep_ps, M2) + M3 = np.mean(w_cont[:, np.newaxis] * int_cov, axis=0) + inf_cont_3 = np.dot(asy_lin_rep_wols, M3) + inf_control = (inf_cont_1 + inf_cont_2 - inf_cont_3) / np.mean(w_cont) + + dr_att_inf_func = inf_treat - inf_control + return dr_att, dr_att_inf_func + + +def std_ipw_did_rc(y, post, D, covariates=None, i_weights=None, trim_level=0.995): + D = np.asarray(D).flatten() + y = np.asarray(y).flatten() + post = np.asarray(post).flatten() + n = len(D) + int_cov = _add_intercept(covariates, n) + + if i_weights is None: + i_weights = np.ones(n) + else: + i_weights = np.asarray(i_weights).flatten() + if np.min(i_weights) < 0: + raise ValueError("i_weights must be non-negative") + i_weights = i_weights / np.mean(i_weights) + + pscore_model = sm.GLM(D, int_cov, family=sm.families.Binomial(), freq_weights=i_weights) + pscore_results = pscore_model.fit() + if not pscore_results.converged: + print("Warning: glm algorithm did not converge") + if np.any(np.isnan(pscore_results.params)): + raise ValueError( + "Propensity score model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + ps_fit, trim_ps = _trim_pscore(pscore_results.predict(), D, trim_level) + + w_treat_pre = trim_ps * i_weights * D * (1 - post) + w_treat_post = trim_ps * i_weights * D * post + w_cont_pre = trim_ps * i_weights * ps_fit * (1 - D) * (1 - post) / (1 - ps_fit) + w_cont_post = trim_ps * i_weights * ps_fit * (1 - D) * post / (1 - ps_fit) + + eta_treat_pre = w_treat_pre * y / np.mean(w_treat_pre) + eta_treat_post = w_treat_post * y / np.mean(w_treat_post) + eta_cont_pre = w_cont_pre * y / np.mean(w_cont_pre) + eta_cont_post = w_cont_post * y / np.mean(w_cont_post) + + att_treat_pre = np.mean(eta_treat_pre) + att_treat_post = np.mean(eta_treat_post) + att_cont_pre = np.mean(eta_cont_pre) + att_cont_post = np.mean(eta_cont_post) + ipw_att = (att_treat_post - att_treat_pre) - (att_cont_post - att_cont_pre) + + score_ps = (i_weights * (D - ps_fit))[:, np.newaxis] * int_cov + Hessian_ps = pscore_results.cov_params() * n + asy_lin_rep_ps = np.dot(score_ps, Hessian_ps) + + inf_treat_pre = eta_treat_pre - w_treat_pre * att_treat_pre / np.mean(w_treat_pre) + inf_treat_post = eta_treat_post - w_treat_post * att_treat_post / np.mean(w_treat_post) + inf_treat = inf_treat_post - inf_treat_pre + + inf_cont_pre = eta_cont_pre - w_cont_pre * att_cont_pre / np.mean(w_cont_pre) + inf_cont_post = eta_cont_post - w_cont_post * att_cont_post / np.mean(w_cont_post) + inf_cont = inf_cont_post - inf_cont_pre + + M2_pre = np.mean((w_cont_pre * (y - att_cont_pre))[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_pre) + M2_post = np.mean((w_cont_post * (y - att_cont_post))[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_post) + inf_cont_ps = np.dot(asy_lin_rep_ps, (M2_post - M2_pre)) + + inf_cont = inf_cont + inf_cont_ps + att_inf_func = inf_treat - inf_cont + return ipw_att, att_inf_func + + +def drdid_rc(y, post, D, covariates=None, i_weights=None, trim_level=0.995): + D = np.asarray(D).flatten() + n = len(D) + y = np.asarray(y).flatten() + post = np.asarray(post).flatten() + int_cov = _add_intercept(covariates, n) + + if i_weights is None: + i_weights = np.ones(n) + else: + i_weights = np.asarray(i_weights).flatten() + if np.min(i_weights) < 0: + raise ValueError("i_weights must be non-negative") + i_weights = i_weights / np.mean(i_weights) + + pscore_model = sm.GLM(D, int_cov, family=sm.families.Binomial(), freq_weights=i_weights) + pscore_results = pscore_model.fit() + if not pscore_results.converged: + print("Warning: glm algorithm did not converge") + if np.any(np.isnan(pscore_results.params)): + raise ValueError( + "Propensity score model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + ps_fit, trim_ps = _trim_pscore(pscore_results.predict(), D, trim_level) + + mask_cont_pre = (D == 0) & (post == 0) + reg_cont_pre = sm.WLS(y[mask_cont_pre], int_cov[mask_cont_pre], weights=i_weights[mask_cont_pre]).fit() + if np.any(np.isnan(reg_cont_pre.params)): + raise ValueError( + "Outcome regression model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + out_y_cont_pre = np.dot(int_cov, reg_cont_pre.params) + + mask_cont_post = (D == 0) & (post == 1) + reg_cont_post = sm.WLS(y[mask_cont_post], int_cov[mask_cont_post], weights=i_weights[mask_cont_post]).fit() + if np.any(np.isnan(reg_cont_post.params)): + raise ValueError( + "Outcome regression model coefficients have NA components. \n" + "Multicollinearity (or lack of variation) of covariates is a likely reason." + ) + out_y_cont_post = np.dot(int_cov, reg_cont_post.params) + + out_y_cont = post * out_y_cont_post + (1 - post) * out_y_cont_pre + + mask_treat_pre = (D == 1) & (post == 0) + reg_treat_pre = sm.WLS(y[mask_treat_pre], int_cov[mask_treat_pre], weights=i_weights[mask_treat_pre]).fit() + out_y_treat_pre = np.dot(int_cov, reg_treat_pre.params) + + mask_treat_post = (D == 1) & (post == 1) + reg_treat_post = sm.WLS(y[mask_treat_post], int_cov[mask_treat_post], weights=i_weights[mask_treat_post]).fit() + out_y_treat_post = np.dot(int_cov, reg_treat_post.params) + + w_treat_pre = trim_ps * i_weights * D * (1 - post) + w_treat_post = trim_ps * i_weights * D * post + w_cont_pre = trim_ps * i_weights * ps_fit * (1 - D) * (1 - post) / (1 - ps_fit) + w_cont_post = trim_ps * i_weights * ps_fit * (1 - D) * post / (1 - ps_fit) + w_d = trim_ps * i_weights * D + w_dt1 = trim_ps * i_weights * D * post + w_dt0 = trim_ps * i_weights * D * (1 - post) + + eta_treat_pre = w_treat_pre * (y - out_y_cont) / np.mean(w_treat_pre) + eta_treat_post = w_treat_post * (y - out_y_cont) / np.mean(w_treat_post) + eta_cont_pre = w_cont_pre * (y - out_y_cont) / np.mean(w_cont_pre) + eta_cont_post = w_cont_post * (y - out_y_cont) / np.mean(w_cont_post) + + eta_d_post = w_d * (out_y_treat_post - out_y_cont_post) / np.mean(w_d) + eta_dt1_post = w_dt1 * (out_y_treat_post - out_y_cont_post) / np.mean(w_dt1) + eta_d_pre = w_d * (out_y_treat_pre - out_y_cont_pre) / np.mean(w_d) + eta_dt0_pre = w_dt0 * (out_y_treat_pre - out_y_cont_pre) / np.mean(w_dt0) + + att_treat_pre = np.mean(eta_treat_pre) + att_treat_post = np.mean(eta_treat_post) + att_cont_pre = np.mean(eta_cont_pre) + att_cont_post = np.mean(eta_cont_post) + att_d_post = np.mean(eta_d_post) + att_dt1_post = np.mean(eta_dt1_post) + att_d_pre = np.mean(eta_d_pre) + att_dt0_pre = np.mean(eta_dt0_pre) + + dr_att = (att_treat_post - att_treat_pre) - (att_cont_post - att_cont_pre) + \ + (att_d_post - att_dt1_post) - (att_d_pre - att_dt0_pre) + + weights_ols_pre = i_weights * (1 - D) * (1 - post) + wols_x_pre = weights_ols_pre[:, np.newaxis] * int_cov + wols_eX_pre = weights_ols_pre[:, np.newaxis] * (y - out_y_cont_pre)[:, np.newaxis] * int_cov + XpX_inv_pre = np.linalg.inv(np.dot(wols_x_pre.T, int_cov) / n) + asy_lin_rep_ols_pre = np.dot(wols_eX_pre, XpX_inv_pre) + + weights_ols_post = i_weights * (1 - D) * post + wols_x_post = weights_ols_post[:, np.newaxis] * int_cov + wols_eX_post = weights_ols_post[:, np.newaxis] * (y - out_y_cont_post)[:, np.newaxis] * int_cov + XpX_inv_post = np.linalg.inv(np.dot(wols_x_post.T, int_cov) / n) + asy_lin_rep_ols_post = np.dot(wols_eX_post, XpX_inv_post) + + weights_ols_pre_treat = i_weights * D * (1 - post) + wols_x_pre_treat = weights_ols_pre_treat[:, np.newaxis] * int_cov + wols_eX_pre_treat = weights_ols_pre_treat[:, np.newaxis] * (y - out_y_treat_pre)[:, np.newaxis] * int_cov + XpX_inv_pre_treat = np.linalg.inv(np.dot(wols_x_pre_treat.T, int_cov) / n) + asy_lin_rep_ols_pre_treat = np.dot(wols_eX_pre_treat, XpX_inv_pre_treat) + + weights_ols_post_treat = i_weights * D * post + wols_x_post_treat = weights_ols_post_treat[:, np.newaxis] * int_cov + wols_eX_post_treat = weights_ols_post_treat[:, np.newaxis] * (y - out_y_treat_post)[:, np.newaxis] * int_cov + XpX_inv_post_treat = np.linalg.inv(np.dot(wols_x_post_treat.T, int_cov) / n) + asy_lin_rep_ols_post_treat = np.dot(wols_eX_post_treat, XpX_inv_post_treat) + + score_ps = i_weights[:, np.newaxis] * (D - ps_fit)[:, np.newaxis] * int_cov + Hessian_ps = pscore_results.cov_params() * n + asy_lin_rep_ps = np.dot(score_ps, Hessian_ps) + + inf_treat_pre = eta_treat_pre - w_treat_pre * att_treat_pre / np.mean(w_treat_pre) + inf_treat_post = eta_treat_post - w_treat_post * att_treat_post / np.mean(w_treat_post) + + M1_post = -np.mean(w_treat_post[:, np.newaxis] * post[:, np.newaxis] * int_cov, axis=0) / np.mean(w_treat_post) + M1_pre = -np.mean(w_treat_pre[:, np.newaxis] * (1 - post)[:, np.newaxis] * int_cov, axis=0) / np.mean(w_treat_pre) + + inf_treat_or_post = np.dot(asy_lin_rep_ols_post, M1_post) + inf_treat_or_pre = np.dot(asy_lin_rep_ols_pre, M1_pre) + + inf_cont_pre = eta_cont_pre - w_cont_pre * att_cont_pre / np.mean(w_cont_pre) + inf_cont_post = eta_cont_post - w_cont_post * att_cont_post / np.mean(w_cont_post) + + M2_pre = np.mean(w_cont_pre[:, np.newaxis] * (y - out_y_cont - att_cont_pre)[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_pre) + M2_post = np.mean(w_cont_post[:, np.newaxis] * (y - out_y_cont - att_cont_post)[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_post) + inf_cont_ps = np.dot(asy_lin_rep_ps, M2_post - M2_pre) + + M3_post = -np.mean(w_cont_post[:, np.newaxis] * post[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_post) + M3_pre = -np.mean(w_cont_pre[:, np.newaxis] * (1 - post)[:, np.newaxis] * int_cov, axis=0) / np.mean(w_cont_pre) + inf_cont_or_post = np.dot(asy_lin_rep_ols_post, M3_post) + inf_cont_or_pre = np.dot(asy_lin_rep_ols_pre, M3_pre) + + inf_eff1 = eta_d_post - w_d * att_d_post / np.mean(w_d) + inf_eff2 = eta_dt1_post - w_dt1 * att_dt1_post / np.mean(w_dt1) + inf_eff3 = eta_d_pre - w_d * att_d_pre / np.mean(w_d) + inf_eff4 = eta_dt0_pre - w_dt0 * att_dt0_pre / np.mean(w_dt0) + inf_eff = (inf_eff1 - inf_eff2) - (inf_eff3 - inf_eff4) + + mom_post = np.mean((w_d[:, np.newaxis] / np.mean(w_d) - w_dt1[:, np.newaxis] / np.mean(w_dt1)) * int_cov, axis=0) + mom_pre = np.mean((w_d[:, np.newaxis] / np.mean(w_d) - w_dt0[:, np.newaxis] / np.mean(w_dt0)) * int_cov, axis=0) + inf_or_post = np.dot(asy_lin_rep_ols_post_treat - asy_lin_rep_ols_post, mom_post) + inf_or_pre = np.dot(asy_lin_rep_ols_pre_treat - asy_lin_rep_ols_pre, mom_pre) + + inf_treat_or = inf_treat_or_post + inf_treat_or_pre + inf_cont_or = inf_cont_or_post + inf_cont_or_pre + inf_or = inf_or_post - inf_or_pre + + inf_treat = inf_treat_post - inf_treat_pre + inf_treat_or + inf_cont = inf_cont_post - inf_cont_pre + inf_cont_ps + inf_cont_or + dr_att_inf_func1 = inf_treat - inf_cont + dr_att_inf_func = dr_att_inf_func1 + inf_eff + inf_or + + return dr_att, dr_att_inf_func diff --git a/csdid/attgt_fnc/preprocess_did.py b/csdid/attgt_fnc/preprocess_did.py index ec4d82b..d4aebaf 100644 --- a/csdid/attgt_fnc/preprocess_did.py +++ b/csdid/attgt_fnc/preprocess_did.py @@ -259,7 +259,8 @@ def pre_process_did(yname, tname, idname, gname, data: pd.DataFrame, ) -> dict: n, t = data.shape - control_group = control_group[0] + if isinstance(control_group, (list, tuple)): + control_group = control_group[0] columns = [idname, tname, yname, gname] # print(columns) # Columns @@ -277,11 +278,14 @@ def pre_process_did(yname, tname, idname, gname, data: pd.DataFrame, # if xformla is None: try: - _, x_cov = fml(xformla, data = data, return_type='dataframe') + try: + _, x_cov = fml(xformla, data=data, return_type='dataframe') + except Exception: + x_cov = patsy.dmatrix(xformla, data=data, return_type='dataframe') _, n_cov = x_cov.shape data = pd.concat([data[columns], x_cov], axis=1) - data = data.assign(w = w) - except: + data = data.assign(w=w) + except Exception: data = data.assign(intercept = 1) clms = columns + ['intercept'] n_cov = len(data.columns) @@ -352,9 +356,11 @@ def pre_process_did(yname, tname, idname, gname, data: pd.DataFrame, true_rep_cross_section = True if panel: - if allow_unbalanced_panel: - panel = False - true_rep_cross_section = False + if allow_unbalanced_panel: + try: + n = data[idname].nunique() + except Exception: + n = len(pd.unique(data[idname])) else: keepers = data.dropna().index n = len(data[idname].unique) @@ -425,4 +431,4 @@ def pre_process_did(yname, tname, idname, gname, data: pd.DataFrame, 'true_rep_cross_section': true_rep_cross_section, 'clustervars': clustervar } - return did_params \ No newline at end of file + return did_params diff --git a/readme.md b/readme.md index e4be0d9..1289a7d 100644 --- a/readme.md +++ b/readme.md @@ -296,4 +296,16 @@ out.aggte(typec='group'); Of particular interest is the `Overall ATT` in the results. Here, we estimate that increasing the minimum wage decreased teen employment by -3.1% and the effect is marginally statistically significant. \ No newline at end of file +3.1% and the effect is marginally statistically significant. + +# How to cite +If you want to cite CSDID, you can use the following BibTeX entry: + +``` python +@software{csdid, + author = {Callaway, Brantly and Sant'Anna, Pedro HC and Quispe, Alexander and Guevara, Carlos}, + title = {{csdid: Difference-in-Differences with Multiple Time Periods in Python}}, + year = {2024}, + url = {https://github.com/d2cml-ai/csdid} +} +``` \ No newline at end of file