
Commit

Merge pull request #31 from UChicagoSUPERgroup/proxy_col_changes
Proxy col changes
samgalen authored Jun 28, 2022
2 parents 572b5f5 + f8297d8 commit c6414bf
Showing 2 changed files with 15 additions and 31 deletions.
6 changes: 3 additions & 3 deletions jupyter_note_book_plugin/prompt-ml/src/notifier.ts
@@ -297,11 +297,12 @@ export class Prompter {
     for (var x = 0; x < proxies.length; x++) {
       var p: any = proxies[x];
       if (!(p["df"] in d))
-        d[p["df"]] = { proxy_col_name: [], sensitive_col_name: [], p_vals: [], coeff: []};
+        d[p["df"]] = { proxy_col_name: [], sensitive_col_name: [], p_vals: [], coeff: [], stat_name : []};
       d[p["df"]]["proxy_col_name"].push(p["proxy_col_name"]);
       d[p["df"]]["sensitive_col_name"].push(p["sensitive_col_name"]);
       d[p["df"]]["p_vals"].push(p["p"]);
       d[p["df"]]["coeff"].push(p["coefficient"]);
+      d[p["df"]]["stat_name"].push(p["stat_name"]);
     }
     var message = this._makeProxyMsg(d);
     this._appendNote(message);
@@ -328,7 +329,6 @@ export class Prompter {
   private _makeProxyMsg(d: any) {
     var note = new PopupNotification("proxy", false, "Proxy Columns", d);
     note.addHeader("Proxy Columns");
-
     for (let df_name in d) {
       note.addHeader(`Within <span class="code-snippet">${df_name}</span></strong>`);
       var df = d[df_name];
@@ -342,7 +342,7 @@
         };
       }
 
-      var col_name = `${df["proxy_col_name"][idx]} (${df["coeff"][idx]})`;
+      var col_name = `${df["proxy_col_name"][idx]} (${df["stat_name"][idx]} = ${df["coeff"][idx]})`;
 
       if (df["p_vals"][idx] < 0.001) {
         tableRows[columnName].correlated.push(col_name);
40 changes: 12 additions & 28 deletions jupyter_note_book_plugin/serverextension/prompter/notifications.py
@@ -10,7 +10,6 @@

 import math
 import operator
-from ssl import VERIFY_X509_TRUSTED_FIRST
 from tokenize import group
 
 import pandas as pd
@@ -396,34 +395,26 @@ def _test_combo(self, df, sens_col, not_sense_col):
             return None
         if sens_col_type == "categorical" and not_sense_col_type == "numeric":
             coeff,p = self._apply_ANOVA(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "F",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "categorical" and not_sense_col_type == "categorical":
             coeff,p = self._apply_chisq(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "Chisq",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "numeric" and not_sense_col_type == "numeric":
             coeff,p = self._apply_spearman(df, sens_col, not_sense_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "Rho",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         if sens_col_type == "numeric" and not_sense_col_type == "categorical":
             coeff,p = self._apply_ANOVA(df, not_sense_col, sens_col)
-            if pd.isna(coeff):
-                raise Exception(f"na coeff {sens_col} {not_sense_col}")
-            if p < PVAL_CUTOFF:
-                return {"sensitive_col_name" : sens_col,
+            if not pd.isna(coeff) and p < PVAL_CUTOFF:
+                return {"sensitive_col_name" : sens_col, "stat_name" : "F",
                         "proxy_col_name" : not_sense_col, "p" : p,
                         "coefficient" : round(coeff, 2)}
         return None
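For orientation, a minimal sketch (not part of the commit) of the dictionary each matching branch of _test_combo now returns; the column names and numbers below are invented, and "stat_name" is the key this change adds:

finding = {
    "sensitive_col_name": "sex",    # hypothetical sensitive column
    "proxy_col_name": "zip_code",   # hypothetical proxy column
    "stat_name": "F",               # "F", "Chisq", or "Rho", depending on the test applied
    "p": 0.0004,                    # p-value from the test
    "coefficient": 12.31,           # the test statistic, rounded to 2 decimals
}
# The notifier front end would render this entry as: zip_code (F = 12.31)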
@@ -510,29 +501,22 @@ def _apply_ANOVA(self, df, sense_col, num_col):
         sense_col_values = df[sense_col].dropna().unique()
 
         if len(df[num_col].dropna().unique()) < 2: # f test is not defined if values are uniform
-            return 1.0
+            return None,1.0
         if len(sense_col_values) < 2:
-            return 1.0
+            return None,1.0
         value_cols = [df[num_col][df[sense_col] == v].dropna() for v in sense_col_values]
 
-        total_var = df[num_col].var(ddof=0, skipna=True)
-        total_mean = df[num_col].mean()
-
-        corr_ratio_num = sum([len(subset)*(subset.mean() - total_mean)**2 for subset in value_cols if len(subset) > 0])
-        corr_ratio = np.sqrt((corr_ratio_num/len(df[num_col]))/total_var)
 
         result = f_oneway(*value_cols)
 
-        return corr_ratio,result[1] # this returns the p-value
+        return result[0],result[1] # this returns the p-value
 
     def _apply_chisq(self, df, sense_col, cat_col):
         # pylint: disable=no-self-use
         # contingency table
         table = pd.crosstab(df[sense_col], df[cat_col])
         result = chi2_contingency(table.to_numpy())
-        coeff = association(table.to_numpy(), method="cramer")
-
-        return coeff,result[1] # returns the p-value
+        return result[0],result[1] # returns the p-value
 
     def _apply_spearman(self, df, sens_col, not_sens_col):
         # pylint: disable=no-self-use
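The simplified helpers above rely on scipy returning the test statistic first and the p-value second, so result[0] and result[1] are the statistic and the p-value respectively. A standalone sketch (not from the repository; the toy dataframe and column names are invented) of both paths:

import pandas as pd
from scipy.stats import chi2_contingency, f_oneway

df = pd.DataFrame({
    "group": ["a", "a", "a", "b", "b", "b"],   # hypothetical categorical column
    "score": [1.0, 2.0, 1.5, 5.0, 6.0, 5.5],   # hypothetical numeric column
    "label": ["x", "x", "y", "y", "y", "x"],   # hypothetical categorical column
})

# ANOVA path: group the numeric column by category, as _apply_ANOVA does.
value_cols = [df["score"][df["group"] == v].dropna() for v in df["group"].unique()]
anova = f_oneway(*value_cols)
print(anova[0], anova[1])   # F statistic, p-value

# Chi-squared path: test of independence on a contingency table, as _apply_chisq does.
table = pd.crosstab(df["group"], df["label"])
chisq = chi2_contingency(table.to_numpy())
print(chisq[0], chisq[1])   # chi-square statistic, p-value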
