|
1 | 1 | """Target Encoder"""
|
| 2 | +import warnings |
2 | 3 | import numpy as np
|
3 | 4 | import pandas as pd
|
4 | 5 | from sklearn.base import BaseEstimator
|
@@ -35,10 +36,12 @@ class TargetEncoder(BaseEstimator, util.TransformerWithTargetMixin):
|
35 | 36 | handle_unknown: str
|
36 | 37 | options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
|
37 | 38 | min_samples_leaf: int
|
38 |
| - minimum samples to take category average into account. |
| 39 | + For regularization, a weighted average of the category mean and the global mean is taken. The weight |
| 40 | + follows an S-shaped curve between 0 and 1 as a function of the number of samples in a category. |
| 41 | + The curve reaches 0.5 at min_samples_leaf (parameter k in the original paper). |
39 | 42 | smoothing: float
|
40 | 43 | smoothing effect to balance categorical average vs prior. Higher value means stronger regularization.
|
41 |
| - The value must be strictly bigger than 0. |
| 44 | + The value must be strictly bigger than 0. Higher values mean a flatter S-curve (see min_samples_leaf). |
42 | 45 |
|
43 | 46 | Example
|
44 | 47 | -------
|
@@ -88,7 +91,13 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h
|
88 | 91 | self.cols = cols
|
89 | 92 | self.ordinal_encoder = None
|
90 | 93 | self.min_samples_leaf = min_samples_leaf
|
91 |
| - self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division |
| 94 | + if min_samples_leaf == 1: |
| 95 | + warnings.warn("Default parameter min_samples_leaf will change in version 2.6. " |
| 96 | + "See https://github.com/scikit-learn-contrib/category_encoders/issues/327") |
| 97 | + self.smoothing = smoothing |
| 98 | + if smoothing == 1.0: |
| 99 | + warnings.warn("Default parameter smoothing will change in version 2.6. " |
| 100 | + "See https://github.com/scikit-learn-contrib/category_encoders/issues/327") |
92 | 101 | self._dim = None
|
93 | 102 | self.mapping = None
|
94 | 103 | self.handle_unknown = handle_unknown
|
|
0 commit comments