From 7f9cc5d9eafa097065ccfc1d216a2e17465b320a Mon Sep 17 00:00:00 2001 From: Brad Hackinen Date: Mon, 3 Feb 2025 16:03:03 -0800 Subject: [PATCH 1/2] Added offset option for fepois --- pyfixest/estimation/FixestMulti_.py | 14 ++++++++--- pyfixest/estimation/estimation.py | 14 +++++++++-- pyfixest/estimation/feols_.py | 27 +++++++++++++++++++++ pyfixest/estimation/fepois_.py | 13 +++++++--- pyfixest/estimation/model_matrix_fixest_.py | 11 +++++++-- 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 1602d9072..63079d936 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -135,6 +135,7 @@ def _prepare_estimation( ssc: Optional[dict[str, Union[str, bool]]] = None, fixef_rm: str = "none", drop_intercept: bool = False, + offset: Optional[Union[None, str]] = None, ) -> None: """ Prepare model for estimation. @@ -153,9 +154,13 @@ def _prepare_estimation( A string or dictionary specifying the type of variance-covariance matrix to use for inference. See `feols()` or `fepois()`. - weights : Union[None, np.ndarray], optional - An array of weights. - Either None or a 1D array of length N. Default is None. + weights : Union[None, str], optional + Default is None. Weights for WLS estimation. If None, all observations + are weighted equally. If a string, the name of the column in `data` that + contains the weights. + offset : Union[None, str], optional + Default is None. Offset variable for Poisson regression. If None, no offset. + If a string, the name of the column in `data` that contains the offset. ssc : dict[str, str], optional A dictionary specifying the type of standard errors to use for inference. See `feols()` or `fepois()`. @@ -179,6 +184,7 @@ def _prepare_estimation( self._is_multiple_estimation = False self._drop_intercept = False self._weights = weights + self._offset = offset self._has_weights = False if weights is not None: self._has_weights = True @@ -247,6 +253,7 @@ def _estimate_all_models( _ssc_dict = self._ssc_dict _drop_intercept = self._drop_intercept _weights = self._weights + _offset = self._offset _fixef_tol = self._fixef_tol _weights_type = self._weights_type _lean = self._lean @@ -339,6 +346,7 @@ def _estimate_all_models( drop_intercept=_drop_intercept, weights=_weights, weights_type=_weights_type, + offset=_offset, solver=solver, demeaner_backend=demeaner_backend, collin_tol=collin_tol, diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index b65d81483..9deec439a 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -1,5 +1,5 @@ from collections.abc import Mapping -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Sequence import pandas as pd @@ -503,6 +503,7 @@ def fepois( fml: str, data: DataFrameType, # type: ignore vcov: Optional[Union[VcovTypeOptions, dict[str, str]]] = None, + offset: Union[None, str] = None, ssc: Optional[dict[str, Union[str, bool]]] = None, fixef_rm: FixedRmOptions = "none", fixef_tol: float = 1e-08, @@ -545,6 +546,10 @@ def fepois( Type of variance-covariance matrix for inference. Options include "iid", "hetero", "HC1", "HC2", "HC3", or a dictionary for CRV1/CRV3 inference. + offset : Union[None, str], optional + Default is None. Offset variable for Poisson regression. If None, no offset. + If a string, the name of the column in `data` that contains the offset. + ssc : str A ssc object specifying the small sample correction for inference. @@ -670,6 +675,7 @@ def fepois( data=data, vcov=vcov, weights=weights, + offset=offset, ssc=ssc, fixef_rm=fixef_rm, collin_tol=collin_tol, @@ -702,7 +708,7 @@ def fepois( ) fixest._prepare_estimation( - "fepois", fml, vcov, weights, ssc, fixef_rm, drop_intercept + "fepois", fml, vcov, weights, ssc, fixef_rm, drop_intercept, offset=offset ) if fixest._is_iv: raise NotImplementedError( @@ -1001,6 +1007,7 @@ def _estimation_input_checks( split: Optional[str], fsplit: Optional[str], separation_check: Optional[list[str]] = None, + offset: Optional[Union[None, str]] = None, ): if not isinstance(fml, str): raise TypeError("fml must be a string") @@ -1027,6 +1034,9 @@ def _estimation_input_checks( if weights is not None: assert weights in data.columns, "weights must be a column in data" + if offset is not None: + assert offset in data.columns, "offset must be a column in data" + bool_args = [copy_data, store_data, lean] for arg in bool_args: if not isinstance(arg, bool): diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index 113751fb9..fc67f490c 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -229,6 +229,7 @@ def __init__( context: Union[int, Mapping[str, Any]] = 0, sample_split_var: Optional[str] = None, sample_split_value: Optional[Union[str, int, float]] = None, + offset: Optional[str] = None, ) -> None: self._sample_split_value = sample_split_value self._sample_split_var = sample_split_var @@ -250,6 +251,7 @@ def __init__( self._drop_intercept = drop_intercept self._weights_name = weights self._weights_type = weights_type + self._offset_name = offset self._has_weights = weights is not None self._collin_tol = collin_tol self._fixef_tol = fixef_tol @@ -347,6 +349,7 @@ def prepare_model_matrix(self): drop_singletons=self._drop_singletons, drop_intercept=self._drop_intercept, weights=self._weights_name, + offset=self._offset_name, context=self._context, ) @@ -357,6 +360,7 @@ def prepare_model_matrix(self): self._endogvar = mm_dict.get("endogvar") self._Z = mm_dict.get("Z") self._weights_df = mm_dict.get("weights_df") + self._offset_df = mm_dict.get("offset_df") self._na_index = mm_dict.get("na_index") self._na_index_str = mm_dict.get("na_index_str") self._icovars = mm_dict.get("icovars") @@ -372,8 +376,10 @@ def prepare_model_matrix(self): self._data = _drop_cols(self._data, self._na_index) self._weights = self._set_weights() + self._offset = self._set_offset() self._N, self._N_rows = self._set_nobs() + def _set_nobs(self) -> tuple[int, int]: """ Fetch the number of observations used in fitting the regression model. @@ -411,6 +417,27 @@ def _set_weights(self) -> np.ndarray: _weights = np.ones(N) return _weights.reshape((N, 1)) + + def _set_offset(self) -> np.ndarray: + """ + Return the offset used in the regression model. + + Returns + ------- + np.ndarray + The offset used in the regression model. + If no offset is used, returns an array of zeros + with the same length as the dependent variable array. + """ + + N = len(self._Y) + + if self._offset_df is not None: + _offset = self._offset_df.to_numpy() + else: + _offset = np.zeros(N) + + return _offset.reshape((N, 1)) def demean(self): "Demean the dependent variable and covariates by the fixed effect(s)." diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index fd7ae90a9..6fd6017a3 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -80,6 +80,7 @@ def __init__( drop_intercept: bool, weights: Optional[str], weights_type: Optional[str], + offset: Optional[float], collin_tol: float, fixef_tol: float, lookup_demeaned_data: dict[str, pd.DataFrame], @@ -105,6 +106,7 @@ def __init__( drop_intercept=drop_intercept, weights=weights, weights_type=weights_type, + offset=offset, collin_tol=collin_tol, fixef_tol=fixef_tol, lookup_demeaned_data=lookup_demeaned_data, @@ -147,7 +149,7 @@ def prepare_model_matrix(self): raise ValueError( "The dependent variable must be a weakly positive integer." ) - + # check for separation na_separation: list[int] = [] if ( @@ -168,6 +170,7 @@ def prepare_model_matrix(self): self._Y.drop(na_separation, axis=0, inplace=True) self._X.drop(na_separation, axis=0, inplace=True) self._fe.drop(na_separation, axis=0, inplace=True) + self._offset = np.delete(self._offset,na_separation,axis=0) # _offset is a numpy array so we use delete instead of drop self._data.drop(na_separation, axis=0, inplace=True) self._N = self._Y.shape[0] @@ -217,6 +220,7 @@ def get_fit(self) -> None: _Y = self._Y _X = self._X _fe = self._fe + _offset = self._offset _N = self._N _convergence = self.convergence # False _maxiter = self.maxiter @@ -251,13 +255,13 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray): _mean = np.mean(_Y) mu = (_Y + _mean) / 2 eta = np.log(mu) - Z = eta + _Y / mu - 1 + Z = eta - _offset + _Y / mu - 1 reg_Z = Z.copy() last = compute_deviance(_Y, mu) else: # update w and Z - Z = eta + _Y / mu - 1 # eq (8) + Z = eta - _offset + _Y / mu - 1 # eq (8) reg_Z = Z.copy() # eq (9) # tighten HDFE tolerance - currently not possible with PyHDFE @@ -294,7 +298,7 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray): mu_old = mu.copy() # more updating - eta = Z - resid + eta = Z - resid + _offset mu = np.exp(eta) # same criterion as fixest @@ -695,3 +699,4 @@ def _fepois_input_checks(drop_singletons: bool, tol: float, maxiter: int): raise TypeError("maxiter must be integer.") if maxiter <= 0: raise AssertionError("maxiter must be greater than 0.") + diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py index 82720fe86..c5cca4d72 100644 --- a/pyfixest/estimation/model_matrix_fixest_.py +++ b/pyfixest/estimation/model_matrix_fixest_.py @@ -17,6 +17,7 @@ def model_matrix_fixest( data: pd.DataFrame, drop_singletons: bool = False, weights: Optional[str] = None, + offset: Optional[str] = None, drop_intercept=False, context: Union[int, Mapping[str, Any]] = 0, ) -> dict: @@ -123,6 +124,7 @@ def model_matrix_fixest( **({"fml_first_stage": fml_first_stage} if _is_iv else {}), **({"fe": wrap_factorize(fval)} if fval != "0" else {}), **({"weights": weights} if weights is not None else {}), + **({"offset": offset} if offset is not None else {}), } FML = Formula(**fml_kwargs) @@ -130,7 +132,7 @@ def model_matrix_fixest( mm = FML.get_model_matrix( data, output="pandas", context={"factorize": factorize, **_context} ) - endogvar = Z = weights_df = fe = None + endogvar = Z = weights_df = offset_df = fe = None Y = mm["fml_second_stage"]["lhs"] X = mm["fml_second_stage"]["rhs"] @@ -142,8 +144,10 @@ def model_matrix_fixest( fe = mm["fe"] if weights is not None: weights_df = mm["weights"] + if offset is not None: + offset_df = mm["offset"] - for df in [Y, X, Z, endogvar, weights_df]: + for df in [Y, X, Z, endogvar, weights_df,offset_df]: if df is not None: cols_to_convert = df.select_dtypes(exclude=["int64", "float64"]).columns if cols_to_convert.size > 0: @@ -196,6 +200,8 @@ def model_matrix_fixest( endogvar = endogvar[keep_idx] if weights_df is not None: weights_df = weights_df[keep_idx] + if offset is not None: + offset_df = offset_df[keep_idx] na_index = _get_na_index(data.shape[0], Y.index) na_index_str = ",".join(str(x) for x in na_index) @@ -213,6 +219,7 @@ def model_matrix_fixest( "endogvar": endogvar, "Z": Z, "weights_df": weights_df, + "offset_df": offset_df, "na_index": na_index, "na_index_str": na_index_str, "icovars": _icovars, From da5e761268fb41dbdcae440ebcdbfff15dfaad9a Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Tue, 4 Feb 2025 21:36:49 +0100 Subject: [PATCH 2/2] add tests --- tests/test_vs_fixest.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index b317b6822..4d41d854f 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -406,8 +406,9 @@ def test_single_fit_feols_empty( @pytest.mark.parametrize("fml", ols_fmls) @pytest.mark.parametrize("adj", [True]) @pytest.mark.parametrize("cluster_adj", [True]) +@pytest.mark.parametrize("offset", [False, True]) def test_single_fit_fepois( - data_fepois, dropna, inference, f3_type, fml, adj, cluster_adj + data_fepois, dropna, inference, f3_type, fml, adj, cluster_adj, offset ): global test_counter_fepois test_counter_fepois += 1 @@ -418,6 +419,11 @@ def test_single_fit_fepois( ssc_ = ssc(adj=adj, cluster_adj=cluster_adj) data = data_fepois + if offset: + data["offset_var"] = np.ones(data.shape[0]) * 5 + offset_var = "offset_var" + else: + offset_var = None if dropna: data = data.dropna() @@ -433,14 +439,28 @@ def test_single_fit_fepois( r_fml = _c_to_as_factor(fml) r_inference = _get_r_inference(inference) - mod = pf.fepois(fml=fml, data=data, vcov=inference, ssc=ssc_, iwls_tol=1e-10) - r_fixest = fixest.fepois( - ro.Formula(r_fml), - vcov=r_inference, - data=data_r, - ssc=fixest.ssc(adj, "none", cluster_adj, "min", "min", False), - glm_tol=1e-10, - ) + py_arg_dict = { + "fml": fml, + "data": data, + "vcov": inference, + "ssc": ssc_, + "iwls_tol": iwls_tol, + } + + r_arg_dict = { + "fml": ro.Formula(r_fml), + "vcov": r_inference, + "data": data_r, + "ssc": fixest.ssc(adj, "none", cluster_adj, "min", "min", False), + "glm_tol": iwls_tol, + } + + if offset: + py_arg_dict["offset"] = offset_var + r_arg_dict["offset"] = ro.Formula("~" + offset_var) + + mod = pf.fepois(**py_arg_dict) + r_fixest = fixest.fepois(**r_arg_dict) py_coef = mod.coef().xs("X1") py_se = mod.se().xs("X1")