maks-sh · andreitsev · May 28, 2021
diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py
@@ -430,7 +430,7 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy='over
                 sorted by uplift predictions. Then the difference between these conversions is calculated.
 
         bins (int): Determines the number of bins (and relative percentile) in the data. Default is 10.
-        
+
     Returns:
         array (shape = [>2]), array (shape = [>2]), array (shape = [>2]):
         response rate at each percentile for control or treatment group,
@@ -443,44 +443,46 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy='over
 
     group_types = ['treatment', 'control']
     strategy_methods = ['overall', 'by_group']
-    
+
     n_samples = len(y_true)
-    
+
     if group not in group_types:
         raise ValueError(f'Response rate supports only group types in {group_types},'
-                         f' got {group}.') 
+                         f' got {group}.')
 
     if strategy not in strategy_methods:
         raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},'
                          f' got {strategy}.')
-    
+
     if not isinstance(bins, int) or bins <= 0:
         raise ValueError(f'Bins should be positive integer. Invalid value bins: {bins}')
 
     if bins >= n_samples:
         raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}')
-    
+
     y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)
     order = np.argsort(uplift, kind='mergesort')[::-1]
+    predicted_uplift_bin = np.array_split(uplift[order], bins)
+    mean_predicted_uplift_by_bin = np.array([np.mean(current_bin_uplft) for current_bin_uplft in predicted_uplift_bin])
 
     trmnt_flag = 1 if group == 'treatment' else 0
-    
+
     if strategy == 'overall':
         y_true_bin = np.array_split(y_true[order], bins)
         trmnt_bin = np.array_split(treatment[order], bins)
-        
+
         group_size = np.array([len(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)])
         response_rate = np.array([np.mean(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)])
 
     else:  # strategy == 'by_group'
         y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag], bins)
-        
+
         group_size = np.array([len(y) for y in y_bin])
         response_rate = np.array([np.mean(y) for y in y_bin])
 
     variance = np.multiply(response_rate, np.divide((1 - response_rate), group_size))
 
-    return response_rate, variance, group_size
+    return response_rate, variance, group_size, mean_predicted_uplift_by_bin
 
 
 def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=10):
@@ -527,10 +529,10 @@ def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=
     if bins >= n_samples:
         raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}')
 
-    response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
+    response_rate_trmnt, variance_trmnt, n_trmnt, *_ = response_rate_by_percentile(
         y_true, uplift, treatment, group='treatment', strategy=strategy, bins=bins)
 
-    response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
+    response_rate_ctrl, variance_ctrl, n_ctrl, *_ = response_rate_by_percentile(
         y_true, uplift, treatment, group='control', strategy=strategy, bins=bins)
 
     uplift_scores = response_rate_trmnt - response_rate_ctrl
@@ -541,7 +543,7 @@ def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins=
 
 
 def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
-                         bins=10, std=False, total=False, string_percentiles=True):
+                         bins=10, std=False, total=False, string_percentiles=True, add_predicted_uplift: bool = False):
     """Compute metrics: uplift, group size, group response rate, standard deviation at each percentile.
 
     Metrics in columns and percentiles in rows of pandas DataFrame:
@@ -573,7 +575,8 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
             The total response rate is a response rate on the full data amount.
         bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10.
         string_percentiles (bool): type of percentiles in the index: float or string. Default is True (string).
-
+        add_predicted_uplift (bool): if True, concats another column to resulting dataframe
+            (column with mean predicted uplift in bin)
     Returns:
         pandas.DataFrame: DataFrame where metrics are by columns and percentiles are by rows.
     """
@@ -610,10 +613,10 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
 
     y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)
 
-    response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
+    response_rate_trmnt, variance_trmnt, n_trmnt, mean_predicted_uplift = response_rate_by_percentile(
         y_true, uplift, treatment, group='treatment', strategy=strategy, bins=bins)
 
-    response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
+    response_rate_ctrl, variance_ctrl, n_ctrl, *_ = response_rate_by_percentile(
         y_true, uplift, treatment, group='control', strategy=strategy, bins=bins)
 
     uplift_scores = response_rate_trmnt - response_rate_ctrl
@@ -623,8 +626,7 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
 
     if string_percentiles:
         percentiles = [f"0-{percentiles[0]}"] + \
-            [f"{percentiles[i]}-{percentiles[i + 1]}" for i in range(len(percentiles) - 1)]
-
+                      [f"{percentiles[i]}-{percentiles[i + 1]}" for i in range(len(percentiles) - 1)]
 
     df = pd.DataFrame({
         'percentile': percentiles,
@@ -636,10 +638,10 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
     })
 
     if total:
-        response_rate_trmnt_total, variance_trmnt_total, n_trmnt_total = response_rate_by_percentile(
+        response_rate_trmnt_total, variance_trmnt_total, n_trmnt_total, *_ = response_rate_by_percentile(
             y_true, uplift, treatment, strategy=strategy, group='treatment', bins=1)
 
-        response_rate_ctrl_total, variance_ctrl_total, n_ctrl_total = response_rate_by_percentile(
+        response_rate_ctrl_total, variance_ctrl_total, n_ctrl_total, *_ = response_rate_by_percentile(
             y_true, uplift, treatment, strategy=strategy, group='control', bins=1)
 
         df.loc[-1, :] = ['total', n_trmnt_total, n_ctrl_total, response_rate_trmnt_total,
@@ -663,6 +665,13 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
         .set_index('percentile', drop=True, inplace=False) \
         .astype({'n_treatment': 'int32', 'n_control': 'int32'})
 
+    if add_predicted_uplift:
+        # add column: mean_predicted_uplift ---------------------------------------------------------------------------
+        predicted_uplift_df = pd.DataFrame(mean_predicted_uplift_by_bin, columns=['mean_predicted_uplift'])
+        predicted_uplift_df.set_index(df.index[:bins], inplace=True)
+        # -------------------------------------------------------------------------------------------------------------
+        df = pd.concat([df, predicted_uplift_df], axis=1)
+
     return df
 
 

diff --git a/sklift/viz/base.py b/sklift/viz/base.py
@@ -146,7 +146,7 @@ def plot_qini_curve(y_true, uplift, treatment, random=True, perfect=True, negati
 
 
 def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
-                              kind='line', bins=10, string_percentiles=True):
+                              kind='line', bins=10, string_percentiles=True, show_predicted_uplift: bool=False):
     """Plot uplift score, treatment response rate and control response rate at each percentile.
 
     Treatment response rate ia a target mean in the treatment group.
@@ -176,7 +176,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
 
         bins (int): Determines а number of bins (and the relative percentile) in the test data. Default is 10.
         string_percentiles (bool): type of xticks: float or string to plot. Default is True (string).
-
+        show_predicted_uplift (bool): whether to show predicted uplift in each bin
     Returns:
         Object that stores computed values.
     """
@@ -209,7 +209,8 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
                          f' Invalid value string_percentiles: {string_percentiles}')
 
     df = uplift_by_percentile(y_true, uplift, treatment, strategy=strategy,
-                              std=True, total=True, bins=bins, string_percentiles=False)
+                              std=True, total=True, bins=bins, string_percentiles=False,
+                              add_predicted_uplift=show_predicted_uplift)
 
     percentiles = df.index[:bins].values.astype(float)
 
@@ -222,6 +223,9 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
     uplift_score = df.loc[percentiles, 'uplift'].values
     std_uplift = df.loc[percentiles, 'std_uplift'].values
 
+    if show_predicted_uplift:
+        mean_predicted_uplift_score = df.loc[percentiles, 'mean_predicted_uplift'].values
+
     uplift_weighted_avg = df.loc['total', 'uplift']
 
     check_consistent_length(percentiles, response_rate_trmnt,
@@ -268,7 +272,11 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
         axes[1].bar(np.array(percentiles) + delta / 6, response_rate_ctrl, delta / 3,
                     yerr=std_ctrl, color='orange', label='control\nresponse rate')
         axes[0].bar(np.array(percentiles), uplift_score, delta / 1.5,
-                    yerr=std_uplift, color='red', label='uplift')
+                    yerr=std_uplift, color='red', label='uplift', alpha=0.5 if show_predicted_uplift else 1)
+
+        if show_predicted_uplift:
+            axes[0].bar(np.array(percentiles), mean_predicted_uplift_score, delta / 1.5,
+                        color='blue', label='avg pred uplift', alpha=0.5)
 
         axes[0].legend(loc='upper right')
         axes[0].tick_params(axis='x', bottom=False)
@@ -293,6 +301,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall',
     return axes
 
 
+
 def plot_treatment_balance_curve(uplift, treatment, random=True, winsize=0.1):
     """Plot Treatment Balance curve.