Skip to content

Dev Branch for Benchmarks #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions data/benchmark1000_14_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_statsmodels,event_study_statsmodels,twfe_fixest_compressed,twfe_fixest,event_study_fixest,duck_mundlak,duck_mundlak_event,N,T,T0
0,3.524947166442871,3.8372230529785156,1.391993761062622,7.044592618942261,1.0851032733917236,0.025202512741088867,0.014397859573364258,1000,14,7
1,3.586045503616333,3.8418402671813965,0.1384873390197754,0.1546306610107422,0.16529107093811035,0.025308609008789062,0.016378164291381836,1000,14,7
2,3.498058557510376,3.7085466384887695,0.13840508460998535,0.15597152709960938,0.16927051544189453,0.02742910385131836,0.01682305335998535,1000,14,7
4 changes: 4 additions & 0 deletions data/benchmark1000_28_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_statsmodels,event_study_statsmodels,twfe_fixest_compressed,twfe_fixest,event_study_fixest,duck_mundlak,duck_mundlak_event,N,T,T0
0,6.603718280792236,8.796046733856201,0.21630644798278809,0.17549967765808105,0.23299789428710938,0.03426074981689453,0.018976926803588867,1000,28,7
1,6.700540781021118,9.069744110107422,0.17578649520874023,0.17682957649230957,0.23126792907714844,0.03191041946411133,0.01741170883178711,1000,28,7
2,7.750951290130615,8.068708181381226,0.1765155792236328,0.17908310890197754,0.22946405410766602,0.03245687484741211,0.016439437866210938,1000,28,7
4 changes: 4 additions & 0 deletions data/benchmark1000_42_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_statsmodels,event_study_statsmodels,twfe_fixest_compressed,twfe_fixest,event_study_fixest,duck_mundlak,duck_mundlak_event,N,T,T0
0,12.831220388412476,12.35423469543457,0.2366042137145996,0.1718442440032959,0.3121190071105957,0.038512468338012695,0.01694774627685547,1000,42,7
1,14.023149490356445,12.454074621200562,0.19021940231323242,0.17430496215820312,0.3458282947540283,0.03442883491516113,0.016968488693237305,1000,42,7
2,11.490947008132935,12.42515230178833,0.17162346839904785,0.17591285705566406,0.32230591773986816,0.03468441963195801,0.016361236572265625,1000,42,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_1000000_14_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,3.9811136722564697,1000000,14,7
1,3.9024667739868164,1000000,14,7
2,3.94681453704834,1000000,14,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_100000_14_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,0.6330029964447021,100000,14,7
1,0.5921123027801514,100000,14,7
2,0.6203744411468506,100000,14,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_100000_28_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,0.9080502986907959,100000,28,7
1,0.9044373035430908,100000,28,7
2,0.8909964561462402,100000,28,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_100000_42_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,1.3070573806762695,100000,42,7
1,1.1962721347808838,100000,42,7
2,1.2006416320800781,100000,42,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_10000_14_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,0.24356484413146973,10000,14,7
1,0.25767970085144043,10000,14,7
2,0.24818849563598633,10000,14,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_10000_28_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,0.2967844009399414,10000,28,7
1,0.3072526454925537,10000,28,7
2,0.30050110816955566,10000,28,7
4 changes: 4 additions & 0 deletions data/benchmark_twfe_fixest_10000_42_7.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,twfe_fixest,N,T,T0
0,0.30553364753723145,10000,42,7
1,0.32700133323669434,10000,42,7
2,0.3112492561340332,10000,42,7
276 changes: 257 additions & 19 deletions pixi.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
authors = [{name = "Alexander Fischer", email = "[email protected]"}]
dependencies = ["pyfixest", "duckreg", "statsmodels", "ipykernel", "duckdb"]
dependencies = [ "duckreg", "statsmodels", "ipykernel", "duckdb", "pyfixest>=0.25.2", "pyarrow>=17.0.0,<18", "ipywidgets>=8.1.5,<9", "watermark>=2.5.0,<3"]
description = "Add a short description here"
name = "panel-scale-code"
requires-python = ">= 3.11"
Expand Down
362 changes: 216 additions & 146 deletions timining_benchmarks.ipynb

Large diffs are not rendered by default.

40 changes: 35 additions & 5 deletions utils/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,45 @@ def __init__(self, N, T, T0, iter):
self.df = generate_benchmark_data(N = N, T = T, T0 = T0)
self.timings = {}

def mark(self, fun, reps=0):
    """Time ``fun`` over ``self.iter`` runs and record the elapsed seconds.

    Parameters
    ----------
    fun : callable
        Estimator to benchmark. It is called with the keyword arguments
        ``df=self.df, T=self.T, T0=self.T0, reps=reps``.
    reps : int, default 0
        Forwarded unchanged to ``fun`` as ``reps``.

    Returns
    -------
    pandas.DataFrame
        ``self.timings`` as a frame: one column per function marked so
        far, one row per iteration. A run that raised is stored as NaN.
        The same frame is also kept on ``self.timings_df``.
    """
    name = fun.__name__
    # Compare the function's *name*: a callable never equals a string, so
    # testing `fun` itself against this list could not match anything.
    if name in ("duckreg", "feols_compressed"):
        fun_name = f"{name} + reps = {reps}"
    else:
        fun_name = name

    self.timings[fun_name] = np.zeros(self.iter)

    # NOTE: no timeout is enforced. `fun` runs synchronously, so a clock
    # check placed before the call cannot interrupt it; a real limit would
    # require running the estimator in a separate process.
    for i in range(self.iter):
        # perf_counter is monotonic, so wall-clock adjustments during a
        # run cannot distort the measurement.
        start = time.perf_counter()
        try:
            fun(df=self.df, T=self.T, T0=self.T0, reps=reps)
        except MemoryError:
            print(f"MemoryError encountered in {fun_name}. Assigning np.nan.")
            self.timings[fun_name][i] = np.nan
        except Exception as e:
            # Best effort: one failing estimator must not abort the whole
            # benchmark sweep, so record NaN and keep going.
            print(f"An unexpected error occurred: {e}")
            self.timings[fun_name][i] = np.nan
        else:
            self.timings[fun_name][i] = time.perf_counter() - start

    self.timings_df = pd.DataFrame(self.timings)

    return self.timings_df

def to_dataframe(self):

Expand Down
67 changes: 42 additions & 25 deletions utils/estimators.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
import pyfixest as pf
import numpy as np
from duckreg.estimators import DuckMundlak
from duckreg.estimators import DuckMundlak, DuckMundlakEventStudy
import statsmodels.formula.api as smf

def twfe_fixest(df, T, T0, reps=0):
    """Fit the two-way fixed effects model ``Y ~ W | unit + time`` with pyfixest.

    Parameters
    ----------
    df : DataFrame
        Data passed to ``pf.feols``; the formula uses the columns ``Y``,
        ``W``, ``unit`` and ``time``.
    T, T0, reps
        Not used here. They are accepted because the benchmark runner
        calls every estimator with ``df``, ``T``, ``T0`` and ``reps``.

    Returns
    -------
    None
        The fitted model is discarded; the function exists to be timed.
    """
    # lean=True is forwarded to feols -- presumably it drops large
    # intermediate objects from the result; confirm against pyfixest docs.
    pf.feols("Y~W | unit + time", df, lean=True)
    return None

def twfe_fixest_compressed(df, T, T0, reps=0):
    """Fit ``Y ~ W | unit + time`` with pyfixest using ``use_compression=True``.

    Parameters
    ----------
    df : DataFrame
        Data passed to ``pf.feols``; the formula uses the columns ``Y``,
        ``W``, ``unit`` and ``time``.
    T, T0
        Not used here. They are accepted because the benchmark runner
        calls every estimator with the same keyword arguments.
    reps : int, default 0
        Forwarded to ``pf.feols`` as ``reps``.

    Returns
    -------
    None
        The tidy coefficient table is computed and then discarded, so
        building it is part of whatever a caller times.
    """
    pf.feols(
        fml="Y~W | unit + time",
        data=df,
        use_compression=True,
        reps=reps,
    ).tidy()

    return None

def twfe_statsmodels(df, T, T0, reps=0):
    """Fit the two-way fixed effects model by OLS with statsmodels.

    Unit and time effects enter the formula as categorical terms
    (``C(unit)`` and ``C(time)``).

    Parameters
    ----------
    df : DataFrame
        Data passed to ``smf.ols``; the formula uses the columns ``Y``,
        ``W``, ``unit`` and ``time``.
    T, T0, reps
        Not used here. They are accepted because the benchmark runner
        calls every estimator with the same keyword arguments.

    Returns
    -------
    None
        The fitted results object is discarded.
    """
    smf.ols(formula="Y ~ W + C(unit) + C(time)", data=df).fit()
    return None

def event_study_fixest(df, T, T0):
try:
df["ever_treated"] = df.groupby("unit")["W"].transform("max")
m = pf.feols(f"Y ~ i(time, ever_treated, ref = {T0-1}) | unit + time", df)
except MemoryError:
print("MemoryError: Not enough memory to run.")
def event_study_statsmodels(df, T, T0, reps=0):
    """Fit an event-study style OLS regression with statsmodels.

    Side effect: writes (or overwrites) the column ``ever_treated`` on the
    caller's ``df`` in place, set to the per-unit maximum of ``W``.

    Parameters
    ----------
    df : DataFrame
        Data with the columns ``Y``, ``W``, ``unit`` and ``time``.
    T, T0, reps
        Not used here. They are accepted because the benchmark runner
        calls every estimator with the same keyword arguments.

    Returns
    -------
    The fitted statsmodels results object.
    """
    treated_flag = df.groupby("unit")["W"].transform("max")
    df["ever_treated"] = treated_flag

    model = smf.ols(
        formula="Y ~ C(time):C(ever_treated) + C(unit) + C(time)",
        data=df,
    )
    fitted = model.fit()
    return fitted

def event_study_fixest(df, T, T0, reps=0):
    """Fit an event-study regression with pyfixest.

    Side effect: writes (or overwrites) the column ``ever_treated`` on the
    caller's ``df`` in place, set to the per-unit maximum of ``W``.

    Parameters
    ----------
    df : DataFrame
        Data with the columns ``Y``, ``W``, ``unit`` and ``time``.
    T0 : int
        ``T0 - 1`` is inserted into the formula as the ``ref`` value of the
        ``i(time, ever_treated, ...)`` term.
    T, reps
        Not used here. They are accepted because the benchmark runner
        calls every estimator with the same keyword arguments.

    Returns
    -------
    None
        The fitted model is discarded.
    """
    df["ever_treated"] = df.groupby("unit")["W"].transform("max")

    reference_period = T0 - 1
    formula = (
        f"Y ~ i(time, ever_treated, ref = {reference_period}) | unit + time"
    )
    pf.feols(formula, df, lean=True)
    return None

def duck_mundlak(df, T, T0):
def duck_mundlak(df, T, T0, reps = 0):

mundlak = DuckMundlak(
db_name="benchmarks.db",
Expand All @@ -44,9 +44,26 @@ def duck_mundlak(df, T, T0):
unit_col="unit",
time_col="time",
cluster_col="unit",
n_bootstraps=1,
n_bootstraps=reps,
seed = 929
)
mundlak.fit()

return mundlak

def duck_mundlak_event(df, T, T0, reps=0):
    """Fit the duckreg Mundlak event-study estimator.

    The estimator is configured to read the table ``data`` from the
    database file ``benchmarks.db``; this function does not create that
    table, so it presumably must already exist -- verify against the
    caller.

    Parameters
    ----------
    df, T, T0
        Not used here. They are accepted because the benchmark runner
        calls every estimator with the same keyword arguments.
    reps : int, default 0
        Passed as ``n_bootstraps``; a nonzero value requests
        block-bootstrapped standard errors.

    Returns
    -------
    DuckMundlakEventStudy
        The estimator object after ``fit()`` has been called.
    """
    mundlak = DuckMundlakEventStudy(
        db_name="benchmarks.db",
        table_name="data",
        outcome_var="Y",
        treatment_col="W",
        unit_col="unit",
        time_col="time",
        cluster_col="unit",
        n_bootstraps=reps,  # nonzero gives block-bootstrapped standard errors
        seed=42,
        pre_treat_interactions=True,
    )
    # Without fit() only object construction would run (and be timed),
    # unlike duck_mundlak, which fits before returning.
    mundlak.fit()

    return mundlak