
Commit

Merge pull request #31 from UChicagoSUPERgroup/proxy_col_changes
Proxy col changes
samgalen authored Jun 28, 2022
2 parents 572b5f5 + f8297d8 commit c6414bf
Showing 2 changed files with 15 additions and 31 deletions.
6 changes: 3 additions & 3 deletions jupyter_note_book_plugin/prompt-ml/src/notifier.ts
@@ -297,11 +297,12 @@ export class Prompter {
     for (var x = 0; x < proxies.length; x++) {
       var p: any = proxies[x];
       if (!(p["df"] in d))
-        d[p["df"]] = { proxy_col_name: [], sensitive_col_name: [], p_vals: [], coeff: []};
+        d[p["df"]] = { proxy_col_name: [], sensitive_col_name: [], p_vals: [], coeff: [], stat_name : []};
       d[p["df"]]["proxy_col_name"].push(p["proxy_col_name"]);
       d[p["df"]]["sensitive_col_name"].push(p["sensitive_col_name"]);
       d[p["df"]]["p_vals"].push(p["p"]);
       d[p["df"]]["coeff"].push(p["coefficient"]);
+      d[p["df"]]["stat_name"].push(p["stat_name"]);
     }
     var message = this._makeProxyMsg(d);
     this._appendNote(message);
@@ -328,7 +329,6 @@ export class Prompter {
   private _makeProxyMsg(d: any) {
     var note = new PopupNotification("proxy", false, "Proxy Columns", d);
     note.addHeader("Proxy Columns");
-
     for (let df_name in d) {
       note.addHeader(`Within <span class="code-snippet">${df_name}</span></strong>`);
       var df = d[df_name];
@@ -342,7 +342,7 @@
         };
       }
 
-      var col_name = `${df["proxy_col_name"][idx]} (${df["coeff"][idx]})`;
+      var col_name = `${df["proxy_col_name"][idx]} (${df["stat_name"][idx]} = ${df["coeff"][idx]})`;
 
       if (df["p_vals"][idx] < 0.001) {
         tableRows[columnName].correlated.push(col_name);
40 changes: 12 additions & 28 deletions jupyter_note_book_plugin/serverextension/prompter/notifications.py
@@ -10,7 +10,6 @@

 import math
 import operator
-from ssl import VERIFY_X509_TRUSTED_FIRST
 from tokenize import group
 
 import pandas as pd
@@ -396,34 +395,26 @@ def _test_combo(self, df, sens_col, not_sense_col):
             return None
         if sens_col_type == "categorical" and not_sense_col_type == "numeric":
             coeff,p = self._apply_ANOVA(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "F",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "categorical" and not_sense_col_type == "categorical":
             coeff,p = self._apply_chisq(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "Chisq",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "numeric" and not_sense_col_type == "numeric":
             coeff,p = self._apply_spearman(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "Rho",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "numeric" and not_sense_col_type == "categorical":
             coeff,p = self._apply_ANOVA(df, not_sense_col, sens_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "F",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         return None
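For orientation, a minimal sketch (not part of the commit) of the dictionary each matching branch of _test_combo now returns; the column names and numbers below are invented, and "stat_name" is the key this change adds:

finding = {
    "sensitive_col_name": "sex",    # hypothetical sensitive column
    "proxy_col_name": "zip_code",   # hypothetical proxy column
    "stat_name": "F",               # "F", "Chisq", or "Rho", depending on the test applied
    "p": 0.0004,                    # p-value from the test
    "coefficient": 12.31,           # the test statistic, rounded to 2 decimals
}
# The notifier front end would render this entry as: zip_code (F = 12.31)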
@@ -510,29 +501,22 @@ def _apply_ANOVA(self, df, sense_col, num_col):
         sense_col_values = df[sense_col].dropna().unique()
 
         if len(df[num_col].dropna().unique()) < 2: # f test is not defined if values are uniform
-            return 1.0
+            return None,1.0
         if len(sense_col_values) < 2:
-            return 1.0
+            return None,1.0
         value_cols = [df[num_col][df[sense_col] == v].dropna() for v in sense_col_values]
 
-        total_var = df[num_col].var(ddof=0, skipna=True)
-        total_mean = df[num_col].mean()
-
-        corr_ratio_num = sum([len(subset)*(subset.mean() - total_mean)**2 for subset in value_cols if len(subset) > 0])
-        corr_ratio = np.sqrt((corr_ratio_num/len(df[num_col]))/total_var)
 
         result = f_oneway(*value_cols)
 
-        return corr_ratio,result[1] # this returns the p-value
+        return result[0],result[1] # this returns the p-value
 
     def _apply_chisq(self, df, sense_col, cat_col):
         # pylint: disable=no-self-use
         # contingency table
         table = pd.crosstab(df[sense_col], df[cat_col])
         result = chi2_contingency(table.to_numpy())
-        coeff = association(table.to_numpy(), method="cramer")
-
-        return coeff,result[1] # returns the p-value
+        return result[0],result[1] # returns the p-value
 
     def _apply_spearman(self, df, sens_col, not_sens_col):
         # pylint: disable=no-self-use
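The simplified helpers above rely on scipy returning the test statistic first and the p-value second, so result[0] and result[1] are the statistic and the p-value respectively. A standalone sketch (not from the repository; the toy dataframe and column names are invented) of both paths:

import pandas as pd
from scipy.stats import chi2_contingency, f_oneway

df = pd.DataFrame({
    "group": ["a", "a", "a", "b", "b", "b"],   # hypothetical categorical column
    "score": [1.0, 2.0, 1.5, 5.0, 6.0, 5.5],   # hypothetical numeric column
    "label": ["x", "x", "y", "y", "y", "x"],   # hypothetical categorical column
})

# ANOVA path: group the numeric column by category, as _apply_ANOVA does.
value_cols = [df["score"][df["group"] == v].dropna() for v in df["group"].unique()]
anova = f_oneway(*value_cols)
print(anova[0], anova[1])   # F statistic, p-value

# Chi-squared path: test of independence on a contingency table, as _apply_chisq does.
table = pd.crosstab(df["group"], df["label"])
chisq = chi2_contingency(table.to_numpy())
print(chisq[0], chisq[1])   # chi-square statistic, p-value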
