diff --git a/README.md b/README.md
index 71aabd3..c44a4e2 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
 # CTRF
-Causal Transfer Random Forests (CTRF)
+This is the anonymous repository for the paper "Causal Transfer Random Forest: Combining Logged Data and Randomized Experiments for Robust Prediction". It contains the supplementary material "WSDM_Supplmentary_Material.jpg" (kept as an image for anonymity; the PDF version will be released upon acceptance) and the code for reproducing the results; see the supplementary material for details. No further changes were made after the paper was submitted.
-Simple starting point for CTRF method.
\ No newline at end of file
diff --git a/WSDM_Supplmentary_Material.jpg b/WSDM_Supplmentary_Material.jpg
new file mode 100644
index 0000000..9d022a7
Binary files /dev/null and b/WSDM_Supplmentary_Material.jpg differ
diff --git a/ctrf/auction.py b/ctrf/auction.py
index 13bd409..edbfd29 100644
--- a/ctrf/auction.py
+++ b/ctrf/auction.py
@@ -14,7 +14,14 @@ def run_selection(seed, n_samples, auction_size, n_auctions):
     ind = np.random.randint(0, n_samples, size=auction_size*n_auctions)
     return ind, seed+1

-def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate):
+def sample_true(x):
+    # Pick one True position uniformly at random; if the group has no True
+    # entry, return a random position anyway (the caller zeroes it out later)
+    if any(x):
+        return np.random.choice(np.where(x)[0])
+    else:
+        return np.random.choice(np.where(x == False)[0])
+
+
+def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate, position_effect=0):
     seed += 1
     np.random.seed(seed)
@@ -67,19 +74,33 @@ def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate):
     df['Layout'] = df.groupby('AuctionId')['AuctionId'].transform('count')

     # Rank by PClick, then cascade to generate clicks
+    # position_effect=1: cascade, the first item that would be clicked gets
+    # the click; position_effect=0: the click is sampled uniformly among the
+    # would-click items, i.e. the effect of position is zero
     df['Uniform'] = np.random.uniform(size=len(df))
     df['WouldClick'] = np.where(df['Uniform'] <= df['TruePClick'], 1, 0)
     df['Click'] = 0
-    df.loc[df["WouldClick"].ne(0).groupby(df['AuctionId']).idxmax(),'Click']=1
-    df['Click'] = df['Click'] * df['WouldClick']
+    if position_effect == 1:
+        df.loc[df['WouldClick'].ne(0).groupby(df['AuctionId']).idxmax(), 'Click'] = 1
+    else:
+        sample_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).apply(sample_true)
+        group_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).groups
+        idx = []
+        for k in group_id:
+            idx.append(group_id[k][sample_id[k]])
+        df.loc[idx, 'Click'] = 1
+    # Auctions in which nothing would be clicked end up with zero clicks
+    df['Click'] = df['Click'] * df['WouldClick']

     df.drop(columns=['Uniform', 'WouldClick', 'RankingPClick', 'TruePClick'], inplace=True)

     return df, seed+1

 def construct_auction_dataset(dataset):
     X = np.hstack((dataset['auctions'][['PClick', 'Position', 'Layout']],
                    dataset['X'][dataset['auctions']['SampleId']]))
-    y = dataset['y'][dataset['auctions']['SampleId']]
+    # Label with the simulated clicks rather than the underlying outcome
+    y = dataset['auctions']['Click']
+
     return X, y
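For reference, the click-generation logic added above can be exercised in isolation. The following is a minimal sketch on toy data; the tiny DataFrame and the position_effect value are illustrative assumptions, not part of the repository:

import numpy as np
import pandas as pd

np.random.seed(0)
df = pd.DataFrame({
    'AuctionId':  [0, 0, 0, 1, 1, 1],
    'TruePClick': [0.9, 0.8, 0.1, 0.2, 0.9, 0.9],
})
df['Uniform'] = np.random.uniform(size=len(df))
df['WouldClick'] = np.where(df['Uniform'] <= df['TruePClick'], 1, 0)
df['Click'] = 0

def sample_true(x):
    # Same helper as in the diff
    if any(x):
        return np.random.choice(np.where(x)[0])
    return np.random.choice(np.where(x == False)[0])

position_effect = 0
if position_effect == 1:
    # Cascade: idxmax picks the first would-click row of each auction
    df.loc[df['WouldClick'].ne(0).groupby(df['AuctionId']).idxmax(), 'Click'] = 1
else:
    # No position effect: one would-click row per auction, uniformly at random
    would = df['WouldClick'].ne(0)
    sample_id = would.groupby(df['AuctionId']).apply(sample_true)
    groups = would.groupby(df['AuctionId']).groups
    idx = [groups[k][sample_id[k]] for k in groups]
    df.loc[idx, 'Click'] = 1
df['Click'] = df['Click'] * df['WouldClick']
print(df[['AuctionId', 'WouldClick', 'Click']])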
diff --git a/ctrf/metrics.py b/ctrf/metrics.py
index 471e210..fd2cc50 100644
--- a/ctrf/metrics.py
+++ b/ctrf/metrics.py
@@ -1,7 +1,8 @@
 import operator
+import numpy as np
+from sklearn.metrics import roc_auc_score, f1_score, log_loss

-from sklearn.metrics import roc_auc_score
-
+### AUC metric
 def compute_auc(preds, ys):
     preds = sorted(zip(preds, ys), key=operator.itemgetter(0), reverse=True)
     pred_p_te, pred_y_te = zip(*preds)
@@ -11,3 +12,29 @@ def compute_model_auc(model, x_te, y_te):
     x_te = x_te.copy()
     y_te = y_te.copy()
     return compute_auc(model.predict_proba(x_te)[:, 1], y_te)
+
+### F1 score
+def compute_model_f1(model, x_te, y_te):
+    x_te = x_te.copy()
+    y_te = y_te.copy()
+    # f1_score expects (y_true, y_pred) in that order
+    return f1_score(y_te, model.predict(x_te))
+
+### Bias: relative gap between predicted and observed CTR
+def compute_model_bias(model, x_te, y_te):
+    x_te = x_te.copy()
+    y_te = y_te.copy()
+    pred_ctr = np.mean(model.predict_proba(x_te)[:, 1])
+    real_ctr = np.mean(y_te)
+    return abs(pred_ctr - real_ctr) / real_ctr
+
+### RIG: relative information gain of the model's log loss over the
+### entropy of the base CTR
+def compute_model_rig(model, x_te, y_te):
+    x_te = x_te.copy()
+    y_te = y_te.copy()
+    real_ctr = np.mean(y_te)
+    real_entropy = -np.log(real_ctr)*real_ctr - np.log(1 - real_ctr)*(1 - real_ctr)
+    pred_y = model.predict_proba(x_te)[:, 1]
+    l_score = -log_loss(y_te, pred_y)
+    return (real_entropy + l_score) / real_entropy
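The RIG metric above equals (H − logloss)/H, where H is the entropy of the base click rate: a constant predictor at the base CTR scores 0 and a perfect predictor approaches 1. A quick numeric check, with toy labels and probabilities chosen purely for illustration:

import numpy as np
from sklearn.metrics import log_loss

y = np.array([0, 0, 0, 1])                    # base CTR = 0.25
ctr = y.mean()
h = -ctr*np.log(ctr) - (1 - ctr)*np.log(1 - ctr)

p_const = np.full_like(y, ctr, dtype=float)   # always predicts the base rate
print(round((h - log_loss(y, p_const)) / h, 6))  # ~0.0: no information gain

p_good = np.array([0.05, 0.05, 0.05, 0.95])   # close to the true labels
print(round((h - log_loss(y, p_good)) / h, 3))   # ~0.909: close to 1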
diff --git a/ctrf/models.py b/ctrf/models.py
index 24faa5a..88414c2 100644
--- a/ctrf/models.py
+++ b/ctrf/models.py
@@ -1,7 +1,11 @@
 import pandas
 import numpy as np
 import time
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+
+from copy import deepcopy

 def train_rf(X, y, seed, **kwargs):
     start = time.time()
@@ -12,15 +16,81 @@ def train_rf(X, y, seed, **kwargs):
     seed += 1
     return model, seed

-def train_ctrf(X1, y1, X2, y2, seed, **kwargs):
+def train_combine_rf(X1, y1, X2, y2, seed, **kwargs):
+    # Baseline: a single forest trained on the pooled data
+    start = time.time()
     seed += 1
     model = RandomForestClassifier(criterion='entropy', random_state=seed, **kwargs)
-    model.fit(X1, y1)
+    X_c = np.concatenate([X1, X2], axis=0)
+    y_c = np.concatenate([y1, y2])
+    model.fit(X_c, y_c)
+    print('Runtime:', time.time() - start)
+    seed += 1
+    return model, seed

+def train_ctrf(X1, y1, X2, y2, model, combine, seed, **kwargs):
+    # Keep the tree structure of the given forest and re-estimate its leaf
+    # values, on (X2, y2) alone (combine=0) or on the pooled data (combine=1)
+    seed += 1
+    start = time.time()
+    model = deepcopy(model)  # copy the object; leave the original forest intact
     for e in model.estimators_:
-        df = pandas.DataFrame(zip(e.apply(X2), 1-y2, y2), columns=['LeafId', 'NoClick', 'Click'])
+        if combine == 0:
+            df = pandas.DataFrame(zip(e.apply(X2), 1-y2, y2), columns=['LeafId', 'NoClick', 'Click'])
+        else:
+            X_c = np.concatenate([X1, X2], axis=0)
+            y_c = np.concatenate([y1, y2])
+            df = pandas.DataFrame(zip(e.apply(X_c), 1-y_c, y_c), columns=['LeafId', 'NoClick', 'Click'])
         df = df.groupby(['LeafId']).agg(NoClicks=pandas.NamedAgg(column='NoClick', aggfunc='sum'),
                                         Clicks=pandas.NamedAgg(column='Click', aggfunc='sum'))
         e.tree_.value[df.index.array] = np.expand_dims(df[['NoClicks', 'Clicks']].values, axis=1)
+    print('Runtime:', time.time() - start)
     seed += 1
     return model, seed
+
+def calculate_weight(train_X, testing_X):
+    # Importance weights for covariate shift: fit a domain classifier that
+    # separates training rows from testing rows, then weight each training
+    # row by p(test|x)/p(train|x), normalized to mean one
+    pool_X = np.vstack([train_X, testing_X])
+    pool_Y = np.hstack([np.zeros(train_X.shape[0]), np.ones(testing_X.shape[0])])
+    model = LogisticRegression(solver='liblinear')
+    model.fit(pool_X, pool_Y)
+    pred = model.predict_proba(train_X)
+    weights = pred[:, 1] / pred[:, 0]
+    weights = weights / np.mean(weights)
+    return weights
+
+def train_lr_model(X, y, seed, **kwargs):
+    start = time.time()
+    seed += 1
+    model = LogisticRegression(solver='liblinear', random_state=seed)
+    model.fit(X, y)
+    print('Runtime:', time.time() - start)
+    seed += 1
+    return model, seed
+
+def train_gbdt_model(X, y, seed, **kwargs):
+    start = time.time()
+    seed += 1
+    model = GradientBoostingClassifier(random_state=seed)
+    model.fit(X, y)
+    print('Runtime:', time.time() - start)
+    seed += 1
+    return model, seed
+
+def train_lr_weight_model(X, y, weights, seed, **kwargs):
+    start = time.time()
+    seed += 1
+    model = LogisticRegression(solver='liblinear', random_state=seed)
+    model.fit(X, y, sample_weight=weights)
+    print('Runtime:', time.time() - start)
+    seed += 1
+    return model, seed
+
+def train_gbdt_weight_model(X, y, weights, seed, **kwargs):
+    start = time.time()
+    seed += 1
+    model = GradientBoostingClassifier(random_state=seed)
+    model.fit(X, y, sample_weight=weights)
+    print('Runtime:', time.time() - start)
+    seed += 1
+    return model, seed
diff --git a/ctrf/util.py b/ctrf/util.py
new file mode 100644
index 0000000..dbf197a
--- /dev/null
+++ b/ctrf/util.py
@@ -0,0 +1,69 @@
+import numpy as np
+
+def simu_confounding_data(n=2000, p=20, scenario=2, r=0.65, binary=1):
+    simu_data = {'Y': [], 'S': [], 'V': [], 'X': [], 'r': r, 'scenario': scenario}
+
+    # 40% stable (S) features, 60% noisy (V) features
+    p_s = int(p*0.4)
+    p_v = int(p*0.6)
+
+    # Alternating-sign coefficients for the stable features
+    i_grid = np.linspace(1, p_s, p_s)
+    alpha = (-1)**i_grid*(i_grid % 3 + 1)*p/3
+    beta = p/2
+    count = 0
+    while count < n:
+        if scenario == 1:
+            # S causes V
+            S = np.random.normal(size=p_s)
+            V = np.zeros(p_v)
+            for j in range(p_v):
+                V[j] = np.random.normal(loc=int(S[j % p_s] > 0) + int(S[(j+1) % p_s] > 0))
+        else:
+            # V causes S
+            V = np.random.normal(size=p_v)
+            S = np.zeros(p_s)
+            for j in range(p_s):
+                S[j] = np.random.normal(loc=int(V[j % p_v] > 0) + int(V[(j+1) % p_v] > 0))
+
+        # Binarize the observed features
+        S_obs = np.zeros(p_s)
+        V_obs = np.zeros(p_v)
+        S_obs[np.where(S > 0)] = 1
+        V_obs[np.where(V > 0)] = 1
+
+        # The outcome depends only on the stable features
+        logit = np.sum(alpha*S_obs) + (np.sum(S_obs[1:]*S_obs[:p_s-1]))*beta
+
+        Y = 1/(1 + np.exp(-logit)) + np.random.normal(scale=0.2)
+        if binary == 1:
+            Y_obs = 0
+            if Y > 0.5:
+                Y_obs = 1
+        else:
+            Y_obs = Y
+
+        # Selection step: inclusion biases the logged sample so that Y and
+        # the noisy features end up positively correlated
+        noisy_mean = np.mean(V_obs)
+        inclusion = np.random.uniform()
+        ### Positive correlation
+        if (Y_obs > 0.5 and noisy_mean > 0.5) or (Y_obs < 0.5 and noisy_mean < 0.5):
+            if inclusion
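The diff does not include a driver script in this section, so the following is only a hedged sketch of how the pieces could be wired together: a base forest is trained on one dataset and its leaves are re-estimated with train_ctrf. The toy arrays, the n_estimators value, and the choice of combine=1 are illustrative assumptions, not taken from the paper.

import numpy as np
from ctrf.models import train_rf, train_combine_rf, train_ctrf
from ctrf.metrics import compute_model_auc, compute_model_rig

# Toy stand-ins for the two data sources and a held-out test set
rng = np.random.RandomState(0)
X1 = rng.normal(size=(2000, 5)); y1 = (X1[:, 0] + rng.normal(size=2000) > 1).astype(int)
X2 = rng.normal(size=(500, 5));  y2 = (X2[:, 0] + rng.normal(size=500) > 1).astype(int)
X_te = rng.normal(size=(500, 5)); y_te = (X_te[:, 0] + rng.normal(size=500) > 1).astype(int)

seed = 0
base_rf, seed = train_rf(X1, y1, seed, n_estimators=50)
# Keep base_rf's structure; re-estimate its leaves on the pooled data (combine=1)
ctrf_model, seed = train_ctrf(X1, y1, X2, y2, base_rf, 1, seed)
# Pooled-training baseline for comparison
pooled, seed = train_combine_rf(X1, y1, X2, y2, seed, n_estimators=50)

for name, m in [('RF', base_rf), ('CTRF', ctrf_model), ('Pooled RF', pooled)]:
    print(name, compute_model_auc(m, X_te, y_te), compute_model_rig(m, X_te, y_te))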