BLM variance decomposition - overestimated worker effects for low values of real alpha

Hi there,

I am currently trying to use the BLM estimator and have noticed that I get non-zero values for worker effects where I expect none. I then went back to the [BLM example](https://tlamadon.github.io/pytwoway/notebooks/blm_example.html) and used this to simulate a range of values for both, alpha and psi. The below code simulates datasets and conducts a variance decomposition. I then plot the results by contrasting the estimated psi and alpha values with the real ones.

```

import numpy as np
import bipartitepandas as bpd
import pytwoway as tw
from pytwoway import constraints as cons
from matplotlib import pyplot as plt


nl = 3 # Number of worker types
nk = 3 # Number of firm types
blm_params = tw.blm_params({
    'nl': nl, 'nk': nk,
    'a1_mu': 0, 'a1_sig': 1.5, 'a2_mu': 0, 'a2_sig': 1.5,
    's1_low': 0.5, 's1_high': 1.5, 's2_low': 0.5, 's2_high': 1.5, 'verbose': 0
})
cluster_params = bpd.cluster_params({
    'measures': bpd.measures.CDFs(),
    'grouping': bpd.grouping.KMeans(n_clusters=nk),
    'is_sorted': True,
    'copy': False
})
clean_params = bpd.clean_params({
    'drop_returns': 'returners',
    'copy': False
})


real_values_a = []
real_values_psi = []
real_values_cov = []
estimated_a = []
estimated_psi = []
estimated_cov = []
monotonic_mover = []
monotonic_stayer = []

values = np.arange(0,3,0.05)

for a in values:
    for p in values:
        sim_params = bpd.sim_params({
            'nl': nl, 'nk': nk,
            'alpha_sig': a, 'psi_sig': p, 'w_sig': 0.6,
            'c_sort': 0, 'c_netw': 0, 'c_sig': 1
        })

        sim_data = bpd.SimBipartite(sim_params).simulate()

        # Convert into BipartitePandas DataFrame
        sim_data = bpd.BipartiteDataFrame(sim_data)
        # Clean and collapse
        sim_data = sim_data.clean(clean_params).collapse(is_sorted=True, copy=False)
        # Cluster
        sim_data = sim_data.cluster(cluster_params)
        # Convert to event study format
        sim_data = sim_data.to_eventstudy(is_sorted=True, copy=False)

        sim_data_observed = sim_data.loc[:, ['i', 'j1', 'j2', 'y1', 'y2', 't11', 't12', 't21', 't22', 'g1', 'g2', 'w1', 'w2', 'm']]
        sim_data_unobserved = sim_data.loc[:, ['alpha1', 'alpha2', 'k1', 'k2', 'l1', 'l2', 'psi1', 'psi2']]

        movers = sim_data_observed.get_worker_m(is_sorted=True)
        jdata = sim_data_observed.loc[movers, :]
        sdata = sim_data_observed.loc[~movers, :]


        # Initialize BLM estimator
        blm_fit = tw.BLMEstimator(blm_params)
        # Fit BLM estimator
        blm_fit.fit(jdata=jdata, sdata=sdata, n_init=40, n_best=5, ncore=16)
        # Store best model
        blm_model = blm_fit.model

        liks1 = blm_model.liks1

        print('Log-likelihoods monotonic (movers):', np.min(np.diff(liks1)) >= 0)
        monotonic_mover = monotonic_mover + [np.min(np.diff(liks1)) >= 0]
        liks0 = blm_model.liks0
        print('Log-likelihoods monotonic (stayers):', np.min(np.diff(liks0)) >= 0)
        monotonic_stayer = monotonic_stayer + [np.min(np.diff(liks0)) >= 0]

        # Initialize BLM variance decomposition estimator
        blm_fit = tw.BLMVarianceDecomposition(blm_params)
        # Fit BLM estimator
        blm_fit.fit(
            jdata=jdata, sdata=sdata,
            blm_model=blm_model,
            n_samples=20,
            Q_var=[
                        tw.Q.VarAlpha(),
                        tw.Q.VarPsi(),
                        tw.Q.VarPsiPlusAlpha()
                    ],
            ncore=16
        )

        for k, v in blm_fit.res.items():
            print(f'{k!r}: {v.mean()}')

        real_values_a = real_values_a + [np.var(sim_data_unobserved['alpha1'])]
        real_values_psi = real_values_psi + [np.var(sim_data_unobserved['psi1'])]
        real_values_cov = real_values_cov + [np.cov(sim_data_unobserved['psi1'], sim_data_unobserved['alpha1'], ddof=0)[0, 1]]
        estimated_a = estimated_a + [np.mean(blm_fit.res['var(alpha)'])]
        estimated_psi = estimated_psi + [np.mean(blm_fit.res['var(psi)'])]
        estimated_cov = estimated_cov + [np.mean(blm_fit.res['cov(psi, alpha)'])]


points = plt.scatter(real_values_psi, estimated_psi, c=real_values_a)
cbar = plt.colorbar(points)
cbar.set_label("real alpha")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Psi")
plt.plot([0, np.max([np.max(real_values_psi), np.max(estimated_psi)])], [0, np.max([np.max(real_values_psi), np.max(estimated_psi)])])
plt.savefig("./Figures/Jan24/BLM_psi_plot.png")
plt.show()

points = plt.scatter(real_values_a, estimated_a, c=real_values_psi)
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
plt.plot([0, np.max([np.max(real_values_a), np.max(estimated_a)])], [0, np.max([np.max(real_values_a), np.max(estimated_a)])])
plt.savefig("./Figures/Jan24/BLM_alpha_plot.png")
plt.show()

points = plt.scatter(real_values_cov, estimated_cov)
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Cov(psi, alpha)")
plt.plot([np.min([np.min(real_values_cov), np.min(estimated_cov)]), np.max([np.max(real_values_cov), np.max(estimated_cov)])], [np.min([np.min(real_values_cov), np.min(estimated_cov)]), np.max([np.max(real_values_cov), np.max(estimated_cov)])])
plt.savefig("./Figures/Jan24/BLM_cov_plot.png")
plt.show()

import pandas as pd

data = pd.DataFrame(
     {'real_a': real_values_a,
     'estimated_a': estimated_a,
     'real_psi': real_values_psi
    })

selected = data.loc[data['real_a']<=0.5]

points = plt.scatter(selected['real_a'], selected['estimated_a'], c=selected['real_psi'])
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
plt.plot([0, np.max([np.max(selected['real_a']), np.max(selected['estimated_a'])])], [0, np.max([np.max(selected['real_a']), np.max(selected['estimated_a'])])])
plt.savefig("./Figures/Jan24/BLM_alpha_zoom_plot.png")
plt.show()

print("Monotonic movers: " + str(monotonic_mover.count(True)))
print("Non-monotonic movers: " + str(monotonic_mover.count(False)))

print("Monotonic stayers: " + str(monotonic_stayer.count(True)))
print("Non-monotonic stayers: " + str(monotonic_stayer.count(False)))

```


While the firm effects appear to be recovered well, many low alpha values are frequently overestimated. 

Psi looks good overall:
![BLM_psi_plot](https://github.com/tlamadon/pytwoway/assets/48337370/4e4b10b8-6147-4c3e-b1e2-2a54a81f13ef)

Alpha is also doing well for higher values:
![BLM_alpha_plot](https://github.com/tlamadon/pytwoway/assets/48337370/ba6757e2-974a-4a1f-89eb-db75231f6d95)

But when zooming in on lower values:
![BLM_alpha_zoom_plot](https://github.com/tlamadon/pytwoway/assets/48337370/364c2f08-4b51-47b8-bb64-5db0a7463da8)

Lower psi values look fine:
![BLM_psi_zoom_plot](https://github.com/tlamadon/pytwoway/assets/48337370/bca98bb7-53d2-48cb-9d85-9618de0d3399)



I suspect that I am missing some parameter I should change for the variance decomposition to work as expected. Could you maybe hint me which one might control this behaviour?

I also found that several log-likelihoods for movers (around 1/3) are not monotonic and none are monotonic for stayers. Do you maybe have any hints on why this might be the case?


Thank you!

Best,
Martin

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

BLM variance decomposition - overestimated worker effects for low values of real alpha #55

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

BLM variance decomposition - overestimated worker effects for low values of real alpha #55

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions