-
Notifications
You must be signed in to change notification settings - Fork 12
BLM variance decomposition - overestimated worker effects for low values of real alpha #55
Description
Hi there,
I am currently trying to use the BLM estimator and have noticed that I get non-zero values for worker effects where I expect none. I then went back to the BLM example and used it to simulate a range of values for both alpha and psi. The code below simulates datasets and conducts a variance decomposition. I then plot the results by contrasting the estimated psi and alpha values with the real ones.
import numpy as np
import bipartitepandas as bpd
import pytwoway as tw
from pytwoway import constraints as cons
from matplotlib import pyplot as plt
# Model dimensions
nl = 3  # number of worker types
nk = 3  # number of firm types

# BLM estimation parameters: priors for the type-specific means (a*) and
# standard deviations (s*) in both periods.
blm_params = tw.blm_params({
    'nl': nl, 'nk': nk,
    'a1_mu': 0, 'a1_sig': 1.5, 'a2_mu': 0, 'a2_sig': 1.5,
    's1_low': 0.5, 's1_high': 1.5, 's2_low': 0.5, 's2_high': 1.5, 'verbose': 0
})

# Cluster firms into nk groups using k-means on wage CDFs.
cluster_params = bpd.cluster_params({
    'measures': bpd.measures.CDFs(),
    'grouping': bpd.grouping.KMeans(n_clusters=nk),
    'is_sorted': True,
    'copy': False
})

# Drop workers who return to a previous employer.
clean_params = bpd.clean_params({
    'drop_returns': 'returners',
    'copy': False
})

# Accumulators for the parameter sweep: true simulated moments,
# BLM-estimated moments, and EM-monotonicity diagnostics.
real_values_a, real_values_psi, real_values_cov = [], [], []
estimated_a, estimated_psi, estimated_cov = [], [], []
monotonic_mover, monotonic_stayer = [], []

# Grid of standard deviations swept for both alpha_sig and psi_sig.
values = np.arange(0, 3, 0.05)
# Sweep the (alpha_sig, psi_sig) grid: for each pair, simulate a bipartite
# network, fit the BLM estimator, run the variance decomposition, and record
# both the true (simulated) moments and the estimated ones.
# NOTE: indentation of the loop body was restored; the pasted version had it
# flattened to column 0, which is not valid Python.
for a in values:
    for p in values:
        sim_params = bpd.sim_params({
            'nl': nl, 'nk': nk,
            'alpha_sig': a, 'psi_sig': p, 'w_sig': 0.6,
            'c_sort': 0, 'c_netw': 0, 'c_sig': 1
        })
        sim_data = bpd.SimBipartite(sim_params).simulate()
        # Convert into BipartitePandas DataFrame
        sim_data = bpd.BipartiteDataFrame(sim_data)
        # Clean and collapse
        sim_data = sim_data.clean(clean_params).collapse(is_sorted=True, copy=False)
        # Cluster firms into nk groups
        sim_data = sim_data.cluster(cluster_params)
        # Convert to event study format
        sim_data = sim_data.to_eventstudy(is_sorted=True, copy=False)
        # Split observable columns (fed to the estimator) from the
        # unobservables (kept to compute the true moments).
        sim_data_observed = sim_data.loc[:, ['i', 'j1', 'j2', 'y1', 'y2', 't11', 't12', 't21', 't22', 'g1', 'g2', 'w1', 'w2', 'm']]
        sim_data_unobserved = sim_data.loc[:, ['alpha1', 'alpha2', 'k1', 'k2', 'l1', 'l2', 'psi1', 'psi2']]
        movers = sim_data_observed.get_worker_m(is_sorted=True)
        jdata = sim_data_observed.loc[movers, :]
        sdata = sim_data_observed.loc[~movers, :]
        # Initialize and fit the BLM estimator; keep the best of n_best
        # models over n_init random starts.
        blm_fit = tw.BLMEstimator(blm_params)
        blm_fit.fit(jdata=jdata, sdata=sdata, n_init=40, n_best=5, ncore=16)
        blm_model = blm_fit.model
        # EM log-likelihoods should be non-decreasing across iterations;
        # compute each monotonicity check once instead of twice.
        movers_monotonic = bool(np.min(np.diff(blm_model.liks1)) >= 0)
        print('Log-likelihoods monotonic (movers):', movers_monotonic)
        monotonic_mover.append(movers_monotonic)
        stayers_monotonic = bool(np.min(np.diff(blm_model.liks0)) >= 0)
        print('Log-likelihoods monotonic (stayers):', stayers_monotonic)
        monotonic_stayer.append(stayers_monotonic)
        # Variance decomposition on the fitted model; use a distinct name so
        # the decomposition does not shadow the BLM estimator above.
        blm_vd = tw.BLMVarianceDecomposition(blm_params)
        blm_vd.fit(
            jdata=jdata, sdata=sdata,
            blm_model=blm_model,
            n_samples=20,
            Q_var=[
                tw.Q.VarAlpha(),
                tw.Q.VarPsi(),
                tw.Q.VarPsiPlusAlpha()
            ],
            ncore=16
        )
        for k, v in blm_vd.res.items():
            print(f'{k!r}: {v.mean()}')
        # True moments from the simulated unobservables vs. the mean of the
        # sampled decomposition estimates (append, not list concatenation).
        real_values_a.append(np.var(sim_data_unobserved['alpha1']))
        real_values_psi.append(np.var(sim_data_unobserved['psi1']))
        real_values_cov.append(np.cov(sim_data_unobserved['psi1'], sim_data_unobserved['alpha1'], ddof=0)[0, 1])
        estimated_a.append(np.mean(blm_vd.res['var(alpha)']))
        estimated_psi.append(np.mean(blm_vd.res['var(psi)']))
        estimated_cov.append(np.mean(blm_vd.res['cov(psi, alpha)']))
# Diagnostic scatter plots: estimated vs. real variance components, each with
# a 45-degree reference line (points on the line are perfectly recovered).
# Fixes: (1) start a fresh figure per plot so consecutive scatters do not
# stack on one axes; (2) create the output directory before savefig, which
# otherwise raises FileNotFoundError; (3) compute each axis limit once.
import os
os.makedirs("./Figures/Jan24", exist_ok=True)

# Psi: estimated vs. real var(psi), colored by real var(alpha)
plt.figure()
points = plt.scatter(real_values_psi, estimated_psi, c=real_values_a)
cbar = plt.colorbar(points)
cbar.set_label("real alpha")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Psi")
psi_lim = np.max([np.max(real_values_psi), np.max(estimated_psi)])
plt.plot([0, psi_lim], [0, psi_lim])
plt.savefig("./Figures/Jan24/BLM_psi_plot.png")
plt.show()

# Alpha: estimated vs. real var(alpha), colored by real var(psi)
plt.figure()
points = plt.scatter(real_values_a, estimated_a, c=real_values_psi)
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
alpha_lim = np.max([np.max(real_values_a), np.max(estimated_a)])
plt.plot([0, alpha_lim], [0, alpha_lim])
plt.savefig("./Figures/Jan24/BLM_alpha_plot.png")
plt.show()

# Covariance: estimated vs. real cov(psi, alpha); range can be negative,
# so the reference line spans min..max rather than 0..max.
plt.figure()
points = plt.scatter(real_values_cov, estimated_cov)
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Cov(psi, alpha)")
cov_lo = np.min([np.min(real_values_cov), np.min(estimated_cov)])
cov_hi = np.max([np.max(real_values_cov), np.max(estimated_cov)])
plt.plot([cov_lo, cov_hi], [cov_lo, cov_hi])
plt.savefig("./Figures/Jan24/BLM_cov_plot.png")
plt.show()
import pandas as pd

# Zoom in on the low-alpha region (true var(alpha) <= 0.5), where the
# overestimation shows up most clearly.
data = pd.DataFrame({
    'real_a': real_values_a,
    'estimated_a': estimated_a,
    'real_psi': real_values_psi,
})
low_alpha = data['real_a'] <= 0.5
selected = data.loc[low_alpha]

points = plt.scatter(selected['real_a'], selected['estimated_a'], c=selected['real_psi'])
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
# 45-degree reference line for perfect recovery
axis_max = np.max([np.max(selected['real_a']), np.max(selected['estimated_a'])])
plt.plot([0, axis_max], [0, axis_max])
plt.savefig("./Figures/Jan24/BLM_alpha_zoom_plot.png")
plt.show()
# Summarize how many sweep runs produced monotonic EM log-likelihood paths.
n_mover_mono = monotonic_mover.count(True)
n_mover_nonmono = monotonic_mover.count(False)
n_stayer_mono = monotonic_stayer.count(True)
n_stayer_nonmono = monotonic_stayer.count(False)
print(f"Monotonic movers: {n_mover_mono}")
print(f"Non-monotonic movers: {n_mover_nonmono}")
print(f"Monotonic stayers: {n_stayer_mono}")
print(f"Non-monotonic stayers: {n_stayer_nonmono}")
While the firm effects (psi) appear to be recovered well, the worker effects (alpha) are frequently overestimated when the true alpha variance is low.
Alpha is also doing well for higher values:

But when zooming in on lower values:

I suspect that I am missing some parameter I should change for the variance decomposition to work as expected. Could you maybe hint at which one might control this behaviour?
I also found that several log-likelihoods for movers (around 1/3) are not monotonic and none are monotonic for stayers. Do you maybe have any hints on why this might be the case?
Thank you!
Best,
Martin

