3 changes: 1 addition & 2 deletions README.md
@@ -1,4 +1,3 @@
# CTRF
Causal Transfer Random Forests (CTRF)
# This is an anonymous repository for the paper "Causal Transfer Random Forest: Combining Logged Data and Randomized Experiments for Robust Prediction". It contains the Supplementary Material "WSDM_Supplementary_Material.jpg" (for anonymity reasons we keep it as an image; the PDF version will be released upon acceptance) and the code for reproducing the results. Details are given in the Supplementary Material. No further changes have been made since the paper was submitted.

Simple starting point for CTRF method.
Binary file added WSDM_Supplmentary_Material.jpg
29 changes: 25 additions & 4 deletions ctrf/auction.py
@@ -14,7 +14,14 @@ def run_selection(seed, n_samples, auction_size, n_auctions):
    ind = np.random.randint(0, n_samples, size=auction_size*n_auctions)
    return ind, seed+1

def sample_true(x):
    # Pick a uniformly random position among the would-be clicks of an auction;
    # if there are none, fall back to a random non-clicked position.
    if any(x):
        return np.random.choice(np.where(x)[0])
    else:
        return np.random.choice(np.where(~x)[0])


def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate, position_effect=0):
    seed += 1
    np.random.seed(seed)
@@ -67,19 +74,33 @@ def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate, position_effect=0):
    df['Layout'] = df.groupby('AuctionId')['AuctionId'].transform('count')

    # Rank by PClick, then generate clicks
    df['Uniform'] = np.random.uniform(size=len(df))
    df['WouldClick'] = np.where(df['Uniform'] <= df['TruePClick'], 1, 0)
    df['Click'] = 0
    if position_effect == 1:
        # Cascade: the first would-be click in each ranked auction is clicked
        df.loc[df['WouldClick'].ne(0).groupby(df['AuctionId']).idxmax(), 'Click'] = 1
    else:
        # No position effect: a uniformly random would-be click is clicked
        sample_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).apply(sample_true)
        group_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).groups
        idx = []
        for k in group_id:
            idx.append(group_id[k][sample_id[k]])
        df.loc[idx, 'Click'] = 1

    df['Click'] = df['Click'] * df['WouldClick']
    df.drop(columns=['Uniform', 'WouldClick', 'RankingPClick', 'TruePClick'], inplace=True)

    return df, seed+1

def construct_auction_dataset(dataset):
    X = np.hstack((dataset['auctions'][['PClick', 'Position', 'Layout']], dataset['X'][dataset['auctions']['SampleId']]))
    # Labels are the simulated auction clicks rather than the raw sample labels
    y = dataset['auctions']['Click']

    return X, y

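For orientation, a minimal usage sketch (my reading of the code, not part of the repo); `dataset`, `model`, `seed`, and all parameter values here are assumptions:

# Hypothetical: simulate auctions with no position effect, then build the
# click-prediction design matrix and labels from the simulated log.
dataset['auctions'], seed = run_auction(dataset, seed, model, epsilon=0.1,
                                        auction_size=10, n_auctions=1000,
                                        max_slate=4, position_effect=0)
X_auc, y_auc = construct_auction_dataset(dataset)
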
31 changes: 29 additions & 2 deletions ctrf/metrics.py
@@ -1,7 +1,8 @@
import operator
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, log_loss

###AUC Metric
def compute_auc(preds, ys):
    preds = sorted(zip(preds, ys), key=operator.itemgetter(0), reverse=True)
    pred_p_te, pred_y_te = zip(*preds)
@@ -11,3 +12,29 @@ def compute_model_auc(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    return compute_auc(model.predict_proba(x_te)[:, 1], y_te)

###F1 Score
def compute_model_f1(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # sklearn's f1_score expects (y_true, y_pred)
    return f1_score(y_te, model.predict(x_te))

###Bias
def compute_model_bias(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # Relative bias between the mean predicted CTR and the observed CTR
    pred_ctr = np.mean(model.predict_proba(x_te)[:, 1])
    real_ctr = np.mean(y_te)
    return abs(pred_ctr - real_ctr) / real_ctr

###RIG
def compute_model_rig(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # Relative Information Gain: the fraction of the base-rate entropy
    # explained by the model, i.e. (H(real_ctr) - LogLoss) / H(real_ctr)
    real_ctr = np.mean(y_te)
    real_entropy = -np.log(real_ctr)*real_ctr - np.log(1-real_ctr)*(1-real_ctr)
    pred_y = model.predict_proba(x_te)[:, 1]
    l_score = -log_loss(y_te, pred_y)

    return (real_entropy + l_score) / real_entropy

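A quick sanity check on the RIG definition (a sketch, not from the repo): a constant predictor at the base rate has log loss equal to the base-rate entropy, so its RIG is 0, while a perfectly calibrated model approaches 1.

import numpy as np
from sklearn.metrics import log_loss

y = np.array([0, 0, 0, 1])                    # toy labels, base CTR = 0.25
p = np.full(len(y), 0.25)                     # constant base-rate prediction
H = -(0.25*np.log(0.25) + 0.75*np.log(0.75))  # entropy of the base rate
print((H - log_loss(y, p)) / H)               # ~0.0
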
80 changes: 75 additions & 5 deletions ctrf/models.py
@@ -1,7 +1,11 @@
import pandas
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn import svm

from copy import deepcopy

def train_rf(X, y, seed, **kwargs):
    start = time.time()
@@ -12,15 +16,81 @@ def train_rf(X, y, seed, **kwargs):
    seed += 1
    return model, seed

def train_combine_rf(X1, y1, X2, y2, seed, **kwargs):
    start = time.time()
    seed += 1
    model = RandomForestClassifier(criterion='entropy', random_state=seed, **kwargs)
    # Pool the logged (X1, y1) and randomized (X2, y2) data and fit one forest
    X_c = np.concatenate([X1, X2], axis=0)
    y_c = np.concatenate([y1, y2])
    model.fit(X_c, y_c)
    print('Runtime:', time.time()-start)
    seed += 1
    return model, seed

def train_ctrf(X1, y1, X2, y2, model, combine, seed, **kwargs):
    seed += 1
    # Copy the fitted forest so the original model is left untouched
    start = time.time()
    model = deepcopy(model)
    for e in model.estimators_:
        # Re-estimate each tree's leaf values by routing the randomized data
        # (optionally pooled with the logged data) to its leaves
        if combine == 0:
            df = pandas.DataFrame(zip(e.apply(X2), 1-y2, y2), columns=['LeafId', 'NoClick', 'Click'])
        else:
            X_c = np.concatenate([X1, X2], axis=0)
            y_c = np.concatenate([y1, y2])
            df = pandas.DataFrame(zip(e.apply(X_c), 1-y_c, y_c), columns=['LeafId', 'NoClick', 'Click'])
        df = df.groupby(['LeafId']).agg(NoClicks=pandas.NamedAgg(column='NoClick', aggfunc='sum'), Clicks=pandas.NamedAgg(column='Click', aggfunc='sum'))
        e.tree_.value[df.index.array] = np.expand_dims(df[['NoClicks', 'Clicks']].values, axis=1)
    print('Runtime:', time.time()-start)
    seed += 1
    return model, seed

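A minimal sketch of the intended calling pattern for CTRF as I read it (variable names and hyperparameters are assumptions, not from the repo): fit a forest on the logged data, then re-estimate its leaves with the randomized data.

# X1/y1 = logged (source) data, X2/y2 = randomized (target) data.
rf, seed = train_rf(X1, y1, seed=0, n_estimators=100)
# combine=0 re-estimates leaf values from the randomized data alone;
# combine=1 pools logged and randomized data for the leaf estimates.
ctrf, seed = train_ctrf(X1, y1, X2, y2, model=rf, combine=0, seed=seed)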

def calculate_weight(train_X, testing_X):
    # Density-ratio weights via a probabilistic classifier: train a logistic
    # discriminator between training and testing covariates, then weight each
    # training point by p(test|x)/p(train|x), normalized to mean 1
    pool_X = np.vstack([train_X, testing_X])
    pool_Y = np.hstack([np.zeros(train_X.shape[0]), np.ones(testing_X.shape[0])])
    model = LogisticRegression(solver='liblinear')
    model.fit(pool_X, pool_Y)
    pred = model.predict_proba(train_X)
    weights = pred[:, 1] / pred[:, 0]
    weights = weights / np.mean(weights)
    return weights

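A hedged usage sketch for the reweighting baseline (variable names are assumptions): weight logged training rows toward the test distribution, then fit a weighted model.

weights = calculate_weight(X_train, X_test)
lr_w, seed = train_lr_weight_model(X_train, y_train, weights, seed=0)
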
def train_lr_model(X, y, seed, **kwargs):
    start = time.time()
    seed += 1
    model = LogisticRegression(solver='liblinear', random_state=seed)
    model.fit(X, y)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_gbdt_model(X, y, seed, **kwargs):
    start = time.time()
    seed += 1
    model = GradientBoostingClassifier(random_state=seed)
    model.fit(X, y)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_lr_weight_model(X, y, weights, seed, **kwargs):
    start = time.time()
    seed += 1
    model = LogisticRegression(solver='liblinear', random_state=seed)
    model.fit(X, y, sample_weight=weights)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_gbdt_weight_model(X, y, weights, seed, **kwargs):
    start = time.time()
    seed += 1
    model = GradientBoostingClassifier(random_state=seed)
    model.fit(X, y, sample_weight=weights)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

69 changes: 69 additions & 0 deletions ctrf/util.py
@@ -0,0 +1,69 @@
import numpy as np

def simu_confounding_data(n=2000, p=20, scenario=2, r=0.65, binary=1):
    simu_data = {'Y': [], 'S': [], 'V': [], 'X': [], 'r': r, 'scenario': scenario}

    # Split the features into a stable/causal block S and a noisy/spurious block V
    p_s = int(p*0.4)
    p_v = int(p*0.6)

    i_grid = np.linspace(1, p_s, p_s)
    alpha = (-1)**i_grid * (i_grid % 3 + 1) * p/3
    beta = p/2
    count = 0
    while count < n:
        if scenario == 1:
            # S and V are independent
            S = np.random.normal(size=p_s)
            V = np.random.normal(size=p_v)
        elif scenario == 2:
            # S causes V
            S = np.random.normal(size=p_s)
            V = np.zeros(p_v)
            for j in range(p_v):
                V[j] = np.random.normal(loc=int(S[j % p_s] > 0) + int(S[(j+1) % p_s] > 0))
        else:
            # V causes S
            V = np.random.normal(size=p_v)
            S = np.zeros(p_s)
            for j in range(p_s):
                S[j] = np.random.normal(loc=int(V[j % p_v] > 0) + int(V[(j+1) % p_v] > 0))

        # Binarize the observed features
        S_obs = np.zeros(p_s)
        V_obs = np.zeros(p_v)
        S_obs[np.where(S > 0)] = 1
        V_obs[np.where(V > 0)] = 1

        # The outcome depends on S only: main effects plus pairwise interactions
        logit = np.sum(alpha*S_obs) + np.sum(S_obs[1:]*S_obs[:p_s-1])*beta

        Y = 1/(1+np.exp(-logit)) + np.random.normal(scale=0.2)
        if binary == 1:
            Y_obs = 0
            if Y > 0.5:
                Y_obs = 1
        else:
            Y_obs = Y

        # Biased selection: samples where Y and V agree are kept with
        # probability r, samples where they disagree with probability 1-r
        noisy_mean = np.mean(V_obs)
        inclusion = np.random.uniform()
        ###Positive Correlation
        if (Y_obs > 0.5 and noisy_mean > 0.5) or (Y_obs < 0.5 and noisy_mean < 0.5):
            if inclusion < r:
                simu_data['Y'].append(Y_obs)
                simu_data['S'].append(S_obs)
                simu_data['V'].append(V_obs)
                simu_data['X'].append(np.hstack([S_obs, V_obs]))
                count += 1
        ###Negative Correlation
        else:
            if inclusion < (1-r):
                simu_data['Y'].append(Y_obs)
                simu_data['S'].append(S_obs)
                simu_data['V'].append(V_obs)
                simu_data['X'].append(np.hstack([S_obs, V_obs]))
                count += 1
    # Into arrays
    simu_data['Y'] = np.asarray(simu_data['Y'])
    simu_data['X'] = np.asarray(simu_data['X'])
    simu_data['V'] = np.asarray(simu_data['V'])
    simu_data['S'] = np.asarray(simu_data['S'])

    return simu_data

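Finally, a short driver sketch (illustrative values, not from the paper): r controls the strength of the spurious V-Y correlation induced by selection, and at r=0.5 inclusion is independent of that correlation.

# Hypothetical: confounded training draw vs. an unconfounded test draw.
train_data = simu_confounding_data(n=2000, p=20, scenario=2, r=0.65)
test_data = simu_confounding_data(n=2000, p=20, scenario=2, r=0.5)
X_tr, y_tr = train_data['X'], train_data['Y']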