3 changes: 1 addition & 2 deletions README.md
@@ -1,4 +1,3 @@
# CTRF
Causal Transfer Random Forests (CTRF)
# This is an anonymous repository for the paper "Causal Transfer Random Forest: Combining Logged Data and Randomized Experiments for Robust Prediction". It contains the Supplementary Material "WSDM_Supplementary_Material.jpg" (for anonymity reasons we keep it as an image; the PDF version will be released upon acceptance) and the code for reproducing the results. Details are given in the Supplementary Material. No further changes have been made since the paper was submitted.

Simple starting point for CTRF method.
Binary file added WSDM_Supplmentary_Material.jpg
29 changes: 25 additions & 4 deletions ctrf/auction.py
@@ -14,7 +14,14 @@ def run_selection(seed, n_samples, auction_size, n_auctions):
    ind = np.random.randint(0, n_samples, size=auction_size*n_auctions)
    return ind, seed+1

def sample_true(x):
    # Pick a uniformly random position among the would-be clicks of an auction;
    # if there are none, fall back to a random non-clicked position.
    if any(x):
        return np.random.choice(np.where(x)[0])
    else:
        return np.random.choice(np.where(~x)[0])


def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate, position_effect=0):
    seed += 1
    np.random.seed(seed)
@@ -67,19 +74,33 @@ def run_auction(dataset, seed, model, epsilon, auction_size, n_auctions, max_slate, position_effect=0):
    df['Layout'] = df.groupby('AuctionId')['AuctionId'].transform('count')

    # Rank by PClick, then generate clicks
    df['Uniform'] = np.random.uniform(size=len(df))
    df['WouldClick'] = np.where(df['Uniform'] <= df['TruePClick'], 1, 0)
    df['Click'] = 0
    if position_effect == 1:
        # Cascade: the first would-be click in each ranked auction is clicked
        df.loc[df['WouldClick'].ne(0).groupby(df['AuctionId']).idxmax(), 'Click'] = 1
    else:
        # No position effect: a uniformly random would-be click is clicked
        sample_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).apply(sample_true)
        group_id = df['WouldClick'].ne(0).groupby(df['AuctionId']).groups
        idx = []
        for k in group_id:
            idx.append(group_id[k][sample_id[k]])
        df.loc[idx, 'Click'] = 1

    df['Click'] = df['Click'] * df['WouldClick']
    df.drop(columns=['Uniform', 'WouldClick', 'RankingPClick', 'TruePClick'], inplace=True)

    return df, seed+1

def construct_auction_dataset(dataset):
    X = np.hstack((dataset['auctions'][['PClick', 'Position', 'Layout']], dataset['X'][dataset['auctions']['SampleId']]))
    # Labels are the simulated auction clicks rather than the raw sample labels
    y = dataset['auctions']['Click']

    return X, y

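For orientation, a minimal usage sketch (my reading of the code, not part of the repo); `dataset`, `model`, `seed`, and all parameter values here are assumptions:

# Hypothetical: simulate auctions with no position effect, then build the
# click-prediction design matrix and labels from the simulated log.
dataset['auctions'], seed = run_auction(dataset, seed, model, epsilon=0.1,
                                        auction_size=10, n_auctions=1000,
                                        max_slate=4, position_effect=0)
X_auc, y_auc = construct_auction_dataset(dataset)
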
31 changes: 29 additions & 2 deletions ctrf/metrics.py
@@ -1,7 +1,8 @@
import operator
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, log_loss

###AUC Metric
def compute_auc(preds, ys):
    preds = sorted(zip(preds, ys), key=operator.itemgetter(0), reverse=True)
    pred_p_te, pred_y_te = zip(*preds)
@@ -11,3 +12,29 @@ def compute_model_auc(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    return compute_auc(model.predict_proba(x_te)[:, 1], y_te)

###F1 Score
def compute_model_f1(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # sklearn's f1_score expects (y_true, y_pred)
    return f1_score(y_te, model.predict(x_te))

###Bias
def compute_model_bias(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # Relative bias between the mean predicted CTR and the observed CTR
    pred_ctr = np.mean(model.predict_proba(x_te)[:, 1])
    real_ctr = np.mean(y_te)
    return abs(pred_ctr - real_ctr) / real_ctr

###RIG
def compute_model_rig(model, x_te, y_te):
    x_te = x_te.copy()
    y_te = y_te.copy()
    # Relative Information Gain: the fraction of the base-rate entropy
    # explained by the model, i.e. (H(real_ctr) - LogLoss) / H(real_ctr)
    real_ctr = np.mean(y_te)
    real_entropy = -np.log(real_ctr)*real_ctr - np.log(1-real_ctr)*(1-real_ctr)
    pred_y = model.predict_proba(x_te)[:, 1]
    l_score = -log_loss(y_te, pred_y)

    return (real_entropy + l_score) / real_entropy

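A quick sanity check on the RIG definition (a sketch, not from the repo): a constant predictor at the base rate has log loss equal to the base-rate entropy, so its RIG is 0, while a perfectly calibrated model approaches 1.

import numpy as np
from sklearn.metrics import log_loss

y = np.array([0, 0, 0, 1])                    # toy labels, base CTR = 0.25
p = np.full(len(y), 0.25)                     # constant base-rate prediction
H = -(0.25*np.log(0.25) + 0.75*np.log(0.75))  # entropy of the base rate
print((H - log_loss(y, p)) / H)               # ~0.0
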
80 changes: 75 additions & 5 deletions ctrf/models.py
@@ -1,7 +1,11 @@
import pandas
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn import svm

from copy import deepcopy

def train_rf(X, y, seed, **kwargs):
    start = time.time()
@@ -12,15 +16,81 @@ def train_rf(X, y, seed, **kwargs):
    seed += 1
    return model, seed

def train_combine_rf(X1, y1, X2, y2, seed, **kwargs):
    start = time.time()
    seed += 1
    model = RandomForestClassifier(criterion='entropy', random_state=seed, **kwargs)
    # Pool the logged (X1, y1) and randomized (X2, y2) data and fit one forest
    X_c = np.concatenate([X1, X2], axis=0)
    y_c = np.concatenate([y1, y2])
    model.fit(X_c, y_c)
    print('Runtime:', time.time()-start)
    seed += 1
    return model, seed

def train_ctrf(X1, y1, X2, y2, model, combine, seed, **kwargs):
    seed += 1
    # Copy the fitted forest so the original model is left untouched
    start = time.time()
    model = deepcopy(model)
    for e in model.estimators_:
        # Re-estimate each tree's leaf values by routing the randomized data
        # (optionally pooled with the logged data) to its leaves
        if combine == 0:
            df = pandas.DataFrame(zip(e.apply(X2), 1-y2, y2), columns=['LeafId', 'NoClick', 'Click'])
        else:
            X_c = np.concatenate([X1, X2], axis=0)
            y_c = np.concatenate([y1, y2])
            df = pandas.DataFrame(zip(e.apply(X_c), 1-y_c, y_c), columns=['LeafId', 'NoClick', 'Click'])
        df = df.groupby(['LeafId']).agg(NoClicks=pandas.NamedAgg(column='NoClick', aggfunc='sum'), Clicks=pandas.NamedAgg(column='Click', aggfunc='sum'))
        e.tree_.value[df.index.array] = np.expand_dims(df[['NoClicks', 'Clicks']].values, axis=1)
    print('Runtime:', time.time()-start)
    seed += 1
    return model, seed

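A minimal sketch of the intended calling pattern for CTRF as I read it (variable names and hyperparameters are assumptions, not from the repo): fit a forest on the logged data, then re-estimate its leaves with the randomized data.

# X1/y1 = logged (source) data, X2/y2 = randomized (target) data.
rf, seed = train_rf(X1, y1, seed=0, n_estimators=100)
# combine=0 re-estimates leaf values from the randomized data alone;
# combine=1 pools logged and randomized data for the leaf estimates.
ctrf, seed = train_ctrf(X1, y1, X2, y2, model=rf, combine=0, seed=seed)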

def calculate_weight(train_X, testing_X):
    # Density-ratio weights via a probabilistic classifier: train a logistic
    # discriminator between training and testing covariates, then weight each
    # training point by p(test|x)/p(train|x), normalized to mean 1
    pool_X = np.vstack([train_X, testing_X])
    pool_Y = np.hstack([np.zeros(train_X.shape[0]), np.ones(testing_X.shape[0])])
    model = LogisticRegression(solver='liblinear')
    model.fit(pool_X, pool_Y)
    pred = model.predict_proba(train_X)
    weights = pred[:, 1] / pred[:, 0]
    weights = weights / np.mean(weights)
    return weights

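A hedged usage sketch for the reweighting baseline (variable names are assumptions): weight logged training rows toward the test distribution, then fit a weighted model.

weights = calculate_weight(X_train, X_test)
lr_w, seed = train_lr_weight_model(X_train, y_train, weights, seed=0)
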
def train_lr_model(X, y, seed, **kwargs):
    start = time.time()
    seed += 1
    model = LogisticRegression(solver='liblinear', random_state=seed)
    model.fit(X, y)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_gbdt_model(X, y, seed, **kwargs):
    start = time.time()
    seed += 1
    model = GradientBoostingClassifier(random_state=seed)
    model.fit(X, y)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_lr_weight_model(X, y, weights, seed, **kwargs):
    start = time.time()
    seed += 1
    model = LogisticRegression(solver='liblinear', random_state=seed)
    model.fit(X, y, sample_weight=weights)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

def train_gbdt_weight_model(X, y, weights, seed, **kwargs):
    start = time.time()
    seed += 1
    model = GradientBoostingClassifier(random_state=seed)
    model.fit(X, y, sample_weight=weights)
    print('Runtime:', time.time() - start)
    seed += 1
    return model, seed

69 changes: 69 additions & 0 deletions ctrf/util.py
@@ -0,0 +1,69 @@
import numpy as np

def simu_confounding_data(n=2000, p=20, scenario=2, r=0.65, binary=1):
    simu_data = {'Y': [], 'S': [], 'V': [], 'X': [], 'r': r, 'scenario': scenario}

    # Split the features into a stable/causal block S and a noisy/spurious block V
    p_s = int(p*0.4)
    p_v = int(p*0.6)

    i_grid = np.linspace(1, p_s, p_s)
    alpha = (-1)**i_grid * (i_grid % 3 + 1) * p/3
    beta = p/2
    count = 0
    while count < n:
        if scenario == 1:
            # S and V are independent
            S = np.random.normal(size=p_s)
            V = np.random.normal(size=p_v)
        elif scenario == 2:
            # S causes V
            S = np.random.normal(size=p_s)
            V = np.zeros(p_v)
            for j in range(p_v):
                V[j] = np.random.normal(loc=int(S[j % p_s] > 0) + int(S[(j+1) % p_s] > 0))
        else:
            # V causes S
            V = np.random.normal(size=p_v)
            S = np.zeros(p_s)
            for j in range(p_s):
                S[j] = np.random.normal(loc=int(V[j % p_v] > 0) + int(V[(j+1) % p_v] > 0))

        # Binarize the observed features
        S_obs = np.zeros(p_s)
        V_obs = np.zeros(p_v)
        S_obs[np.where(S > 0)] = 1
        V_obs[np.where(V > 0)] = 1

        # The outcome depends on S only: main effects plus pairwise interactions
        logit = np.sum(alpha*S_obs) + np.sum(S_obs[1:]*S_obs[:p_s-1])*beta

        Y = 1/(1+np.exp(-logit)) + np.random.normal(scale=0.2)
        if binary == 1:
            Y_obs = 0
            if Y > 0.5:
                Y_obs = 1
        else:
            Y_obs = Y

        # Biased selection: samples where Y and V agree are kept with
        # probability r, samples where they disagree with probability 1-r
        noisy_mean = np.mean(V_obs)
        inclusion = np.random.uniform()
        ###Positive Correlation
        if (Y_obs > 0.5 and noisy_mean > 0.5) or (Y_obs < 0.5 and noisy_mean < 0.5):
            if inclusion < r:
                simu_data['Y'].append(Y_obs)
                simu_data['S'].append(S_obs)
                simu_data['V'].append(V_obs)
                simu_data['X'].append(np.hstack([S_obs, V_obs]))
                count += 1
        ###Negative Correlation
        else:
            if inclusion < (1-r):
                simu_data['Y'].append(Y_obs)
                simu_data['S'].append(S_obs)
                simu_data['V'].append(V_obs)
                simu_data['X'].append(np.hstack([S_obs, V_obs]))
                count += 1
    # Into arrays
    simu_data['Y'] = np.asarray(simu_data['Y'])
    simu_data['X'] = np.asarray(simu_data['X'])
    simu_data['V'] = np.asarray(simu_data['V'])
    simu_data['S'] = np.asarray(simu_data['S'])

    return simu_data

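Finally, a short driver sketch (illustrative values, not from the paper): r controls the strength of the spurious V-Y correlation induced by selection, and at r=0.5 inclusion is independent of that correlation.

# Hypothetical: confounded training draw vs. an unconfounded test draw.
train_data = simu_confounding_data(n=2000, p=20, scenario=2, r=0.65)
test_data = simu_confounding_data(n=2000, p=20, scenario=2, r=0.5)
X_tr, y_tr = train_data['X'], train_data['Y']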