Skip to content

Feature/predicted uplift viz #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 29 additions & 20 deletions sklift/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy='over
sorted by uplift predictions. Then the difference between these conversions is calculated.

bins (int): Determines the number of bins (and relative percentile) in the data. Default is 10.

Returns:
array (shape = [>2]), array (shape = [>2]), array (shape = [>2]):
response rate at each percentile for control or treatment group,
Expand All @@ -443,44 +443,46 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy='over

group_types = ['treatment', 'control']
strategy_methods = ['overall', 'by_group']

n_samples = len(y_true)

if group not in group_types:
raise ValueError(f'Response rate supports only group types in {group_types},'
f' got {group}.')
f' got {group}.')

if strategy not in strategy_methods:
raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},'
f' got {strategy}.')

if not isinstance(bins, int) or bins <= 0:
raise ValueError(f'Bins should be positive integer. Invalid value bins: {bins}')

if bins >= n_samples:
raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}')

y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)
order = np.argsort(uplift, kind='mergesort')[::-1]
predicted_uplift_bin = np.array_split(uplift[order], bins)
mean_predicted_uplift_by_bin = np.array([np.mean(current_bin_uplft) for current_bin_uplft in predicted_uplift_bin])

trmnt_flag = 1 if group == 'treatment' else 0

if strategy == 'overall':
y_true_bin = np.array_split(y_true[order], bins)
trmnt_bin = np.array_split(treatment[order], bins)

group_size = np.array([len(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)])
response_rate = np.array([np.mean(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)])

else: # strategy == 'by_group'
y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag], bins)

group_size = np.array([len(y) for y in y_bin])
response_rate = np.array([np.mean(y) for y in y_bin])

variance = np.multiply(response_rate, np.divide((1 - response_rate), group_size))

return response_rate, variance, group_size
return response_rate, variance, group_size, mean_predicted_uplift_by_bin


def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=10):
Expand Down Expand Up @@ -527,10 +529,10 @@ def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=
if bins >= n_samples:
raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}')

response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
response_rate_trmnt, variance_trmnt, n_trmnt, *_ = response_rate_by_percentile(
y_true, uplift, treatment, group='treatment', strategy=strategy, bins=bins)

response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
response_rate_ctrl, variance_ctrl, n_ctrl, *_ = response_rate_by_percentile(
y_true, uplift, treatment, group='control', strategy=strategy, bins=bins)

uplift_scores = response_rate_trmnt - response_rate_ctrl
Expand All @@ -541,7 +543,7 @@ def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=


def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
bins=10, std=False, total=False, string_percentiles=True):
bins=10, std=False, total=False, string_percentiles=True, add_predicted_uplift: bool = False):
"""Compute metrics: uplift, group size, group response rate, standard deviation at each percentile.

Metrics in columns and percentiles in rows of pandas DataFrame:
Expand Down Expand Up @@ -573,7 +575,8 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
The total response rate is a response rate on the full data amount.
bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10.
string_percentiles (bool): type of percentiles in the index: float or string. Default is True (string).

add_predicted_uplift (bool): If True, appends a ``mean_predicted_uplift`` column
(the mean predicted uplift in each bin) to the resulting DataFrame. Default is False.
Returns:
pandas.DataFrame: DataFrame where metrics are by columns and percentiles are by rows.
"""
Expand Down Expand Up @@ -610,10 +613,10 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',

y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)

response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
response_rate_trmnt, variance_trmnt, n_trmnt, mean_predicted_uplift = response_rate_by_percentile(
y_true, uplift, treatment, group='treatment', strategy=strategy, bins=bins)

response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
response_rate_ctrl, variance_ctrl, n_ctrl, *_ = response_rate_by_percentile(
y_true, uplift, treatment, group='control', strategy=strategy, bins=bins)

uplift_scores = response_rate_trmnt - response_rate_ctrl
Expand All @@ -623,8 +626,7 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',

if string_percentiles:
percentiles = [f"0-{percentiles[0]}"] + \
[f"{percentiles[i]}-{percentiles[i + 1]}" for i in range(len(percentiles) - 1)]

[f"{percentiles[i]}-{percentiles[i + 1]}" for i in range(len(percentiles) - 1)]

df = pd.DataFrame({
'percentile': percentiles,
Expand All @@ -636,10 +638,10 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
})

if total:
response_rate_trmnt_total, variance_trmnt_total, n_trmnt_total = response_rate_by_percentile(
response_rate_trmnt_total, variance_trmnt_total, n_trmnt_total, *_ = response_rate_by_percentile(
y_true, uplift, treatment, strategy=strategy, group='treatment', bins=1)

response_rate_ctrl_total, variance_ctrl_total, n_ctrl_total = response_rate_by_percentile(
response_rate_ctrl_total, variance_ctrl_total, n_ctrl_total, *_ = response_rate_by_percentile(
y_true, uplift, treatment, strategy=strategy, group='control', bins=1)

df.loc[-1, :] = ['total', n_trmnt_total, n_ctrl_total, response_rate_trmnt_total,
Expand All @@ -663,6 +665,13 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
.set_index('percentile', drop=True, inplace=False) \
.astype({'n_treatment': 'int32', 'n_control': 'int32'})

if add_predicted_uplift:
# add column: mean_predicted_uplift ---------------------------------------------------------------------------
predicted_uplift_df = pd.DataFrame(mean_predicted_uplift_by_bin, columns=['mean_predicted_uplift'])
predicted_uplift_df.set_index(df.index[:bins], inplace=True)
# -------------------------------------------------------------------------------------------------------------
df = pd.concat([df, predicted_uplift_df], axis=1)

return df


Expand Down
17 changes: 13 additions & 4 deletions sklift/viz/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def plot_qini_curve(y_true, uplift, treatment, random=True, perfect=True, negati


def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
kind='line', bins=10, string_percentiles=True):
kind='line', bins=10, string_percentiles=True, show_predicted_uplift: bool=False):
"""Plot uplift score, treatment response rate and control response rate at each percentile.

Treatment response rate is a target mean in the treatment group.
Expand Down Expand Up @@ -176,7 +176,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',

bins (int): Determines the number of bins (and the relative percentile) in the test data. Default is 10.
string_percentiles (bool): type of xticks: float or string to plot. Default is True (string).

show_predicted_uplift (bool): Whether to also plot the mean predicted uplift in each bin. Default is False.
Returns:
Object that stores computed values.
"""
Expand Down Expand Up @@ -209,7 +209,8 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
f' Invalid value string_percentiles: {string_percentiles}')

df = uplift_by_percentile(y_true, uplift, treatment, strategy=strategy,
std=True, total=True, bins=bins, string_percentiles=False)
std=True, total=True, bins=bins, string_percentiles=False,
add_predicted_uplift=show_predicted_uplift)

percentiles = df.index[:bins].values.astype(float)

Expand All @@ -222,6 +223,9 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
uplift_score = df.loc[percentiles, 'uplift'].values
std_uplift = df.loc[percentiles, 'std_uplift'].values

if show_predicted_uplift:
mean_predicted_uplift_score = df.loc[percentiles, 'mean_predicted_uplift'].values

uplift_weighted_avg = df.loc['total', 'uplift']

check_consistent_length(percentiles, response_rate_trmnt,
Expand Down Expand Up @@ -268,7 +272,11 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
axes[1].bar(np.array(percentiles) + delta / 6, response_rate_ctrl, delta / 3,
yerr=std_ctrl, color='orange', label='control\nresponse rate')
axes[0].bar(np.array(percentiles), uplift_score, delta / 1.5,
yerr=std_uplift, color='red', label='uplift')
yerr=std_uplift, color='red', label='uplift', alpha=0.5 if show_predicted_uplift else 1)

if show_predicted_uplift:
axes[0].bar(np.array(percentiles), mean_predicted_uplift_score, delta / 1.5,
color='blue', label='avg pred uplift', alpha=0.5)

axes[0].legend(loc='upper right')
axes[0].tick_params(axis='x', bottom=False)
Expand All @@ -293,6 +301,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
return axes



def plot_treatment_balance_curve(uplift, treatment, random=True, winsize=0.1):
"""Plot Treatment Balance curve.

Expand Down