Skip to content

BLM variance decomposition - overestimated worker effects for low values of real alpha #55

@MartinHaus1993

Description

@MartinHaus1993

Hi there,

I am currently trying to use the BLM estimator and have noticed that I get non-zero values for worker effects where I expect none. I then went back to the BLM example and used this to simulate a range of values for both, alpha and psi. The below code simulates datasets and conducts a variance decomposition. I then plot the results by contrasting the estimated psi and alpha values with the real ones.


import numpy as np
import bipartitepandas as bpd
import pytwoway as tw
from pytwoway import constraints as cons
from matplotlib import pyplot as plt


nl = 3 # Number of worker types
nk = 3 # Number of firm types
blm_params = tw.blm_params({
    'nl': nl, 'nk': nk,
    'a1_mu': 0, 'a1_sig': 1.5, 'a2_mu': 0, 'a2_sig': 1.5,
    's1_low': 0.5, 's1_high': 1.5, 's2_low': 0.5, 's2_high': 1.5, 'verbose': 0
})
cluster_params = bpd.cluster_params({
    'measures': bpd.measures.CDFs(),
    'grouping': bpd.grouping.KMeans(n_clusters=nk),
    'is_sorted': True,
    'copy': False
})
clean_params = bpd.clean_params({
    'drop_returns': 'returners',
    'copy': False
})


real_values_a = []
real_values_psi = []
real_values_cov = []
estimated_a = []
estimated_psi = []
estimated_cov = []
monotonic_mover = []
monotonic_stayer = []

values = np.arange(0,3,0.05)

for a in values:
    for p in values:
        sim_params = bpd.sim_params({
            'nl': nl, 'nk': nk,
            'alpha_sig': a, 'psi_sig': p, 'w_sig': 0.6,
            'c_sort': 0, 'c_netw': 0, 'c_sig': 1
        })

        sim_data = bpd.SimBipartite(sim_params).simulate()

        # Convert into BipartitePandas DataFrame
        sim_data = bpd.BipartiteDataFrame(sim_data)
        # Clean and collapse
        sim_data = sim_data.clean(clean_params).collapse(is_sorted=True, copy=False)
        # Cluster
        sim_data = sim_data.cluster(cluster_params)
        # Convert to event study format
        sim_data = sim_data.to_eventstudy(is_sorted=True, copy=False)

        sim_data_observed = sim_data.loc[:, ['i', 'j1', 'j2', 'y1', 'y2', 't11', 't12', 't21', 't22', 'g1', 'g2', 'w1', 'w2', 'm']]
        sim_data_unobserved = sim_data.loc[:, ['alpha1', 'alpha2', 'k1', 'k2', 'l1', 'l2', 'psi1', 'psi2']]

        movers = sim_data_observed.get_worker_m(is_sorted=True)
        jdata = sim_data_observed.loc[movers, :]
        sdata = sim_data_observed.loc[~movers, :]


        # Initialize BLM estimator
        blm_fit = tw.BLMEstimator(blm_params)
        # Fit BLM estimator
        blm_fit.fit(jdata=jdata, sdata=sdata, n_init=40, n_best=5, ncore=16)
        # Store best model
        blm_model = blm_fit.model

        liks1 = blm_model.liks1

        print('Log-likelihoods monotonic (movers):', np.min(np.diff(liks1)) >= 0)
        monotonic_mover = monotonic_mover + [np.min(np.diff(liks1)) >= 0]
        liks0 = blm_model.liks0
        print('Log-likelihoods monotonic (stayers):', np.min(np.diff(liks0)) >= 0)
        monotonic_stayer = monotonic_stayer + [np.min(np.diff(liks0)) >= 0]

        # Initialize BLM variance decomposition estimator
        blm_fit = tw.BLMVarianceDecomposition(blm_params)
        # Fit BLM estimator
        blm_fit.fit(
            jdata=jdata, sdata=sdata,
            blm_model=blm_model,
            n_samples=20,
            Q_var=[
                        tw.Q.VarAlpha(),
                        tw.Q.VarPsi(),
                        tw.Q.VarPsiPlusAlpha()
                    ],
            ncore=16
        )

        for k, v in blm_fit.res.items():
            print(f'{k!r}: {v.mean()}')

        real_values_a = real_values_a + [np.var(sim_data_unobserved['alpha1'])]
        real_values_psi = real_values_psi + [np.var(sim_data_unobserved['psi1'])]
        real_values_cov = real_values_cov + [np.cov(sim_data_unobserved['psi1'], sim_data_unobserved['alpha1'], ddof=0)[0, 1]]
        estimated_a = estimated_a + [np.mean(blm_fit.res['var(alpha)'])]
        estimated_psi = estimated_psi + [np.mean(blm_fit.res['var(psi)'])]
        estimated_cov = estimated_cov + [np.mean(blm_fit.res['cov(psi, alpha)'])]


points = plt.scatter(real_values_psi, estimated_psi, c=real_values_a)
cbar = plt.colorbar(points)
cbar.set_label("real alpha")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Psi")
plt.plot([0, np.max([np.max(real_values_psi), np.max(estimated_psi)])], [0, np.max([np.max(real_values_psi), np.max(estimated_psi)])])
plt.savefig("./Figures/Jan24/BLM_psi_plot.png")
plt.show()

points = plt.scatter(real_values_a, estimated_a, c=real_values_psi)
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
plt.plot([0, np.max([np.max(real_values_a), np.max(estimated_a)])], [0, np.max([np.max(real_values_a), np.max(estimated_a)])])
plt.savefig("./Figures/Jan24/BLM_alpha_plot.png")
plt.show()

points = plt.scatter(real_values_cov, estimated_cov)
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Cov(psi, alpha)")
plt.plot([np.min([np.min(real_values_cov), np.min(estimated_cov)]), np.max([np.max(real_values_cov), np.max(estimated_cov)])], [np.min([np.min(real_values_cov), np.min(estimated_cov)]), np.max([np.max(real_values_cov), np.max(estimated_cov)])])
plt.savefig("./Figures/Jan24/BLM_cov_plot.png")
plt.show()

import pandas as pd

data = pd.DataFrame(
     {'real_a': real_values_a,
     'estimated_a': estimated_a,
     'real_psi': real_values_psi
    })

selected = data.loc[data['real_a']<=0.5]

points = plt.scatter(selected['real_a'], selected['estimated_a'], c=selected['real_psi'])
cbar = plt.colorbar(points)
cbar.set_label("real psi")
plt.xlabel("Real")
plt.ylabel("Estimated")
plt.title("Alpha")
plt.plot([0, np.max([np.max(selected['real_a']), np.max(selected['estimated_a'])])], [0, np.max([np.max(selected['real_a']), np.max(selected['estimated_a'])])])
plt.savefig("./Figures/Jan24/BLM_alpha_zoom_plot.png")
plt.show()

print("Monotonic movers: " + str(monotonic_mover.count(True)))
print("Non-monotonic movers: " + str(monotonic_mover.count(False)))

print("Monotonic stayers: " + str(monotonic_stayer.count(True)))
print("Non-monotonic stayers: " + str(monotonic_stayer.count(False)))

While the firm effects appear to be recovered well, many low alpha values are frequently overestimated.

Psi looks good overall:
BLM_psi_plot

Alpha is also doing well for higher values:
BLM_alpha_plot

But when zooming in on lower values:
BLM_alpha_zoom_plot

Lower psi values look fine:
BLM_psi_zoom_plot

I suspect that I am missing some parameter I should change for the variance decomposition to work as expected. Could you maybe hint me which one might control this behaviour?

I also found that several log-likelihoods for movers (around 1/3) are not monotonic and none are monotonic for stayers. Do you maybe have any hints on why this might be the case?

Thank you!

Best,
Martin

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions