Skip to content

Commit 2e32822

Browse files
Merge pull request #325 from PaulWestenthanner/refactor/base_class
Refactor/base class
2 parents a18cb64 + f3afca8 commit 2e32822

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+669
-2330
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ coverage.xml
5656

5757
# Sphinx documentation
5858
docs/_build/
59+
docs/build/
5960

6061
# PyBuilder
6162
target/
@@ -68,4 +69,4 @@ runtest.py
6869

6970
*~
7071
*.swp
71-
*.swo
72+
*.swo

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ References
137137
1. Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing for Large Scale Multitask Learning. Proc. ICML.
138138
2. Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. From https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
139139
3. Gregory Carey (2003). Coding Categorical Variables. From http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
140-
4. Strategies to encode categorical variables with many categories. From https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.
140+
4. Owen Zhang - Leave One Out Encoding. From https://datascience.stackexchange.com/questions/10839/what-is-difference-between-one-hot-encoding-and-leave-one-out-encoding
141141
5. Beyond One-Hot: an exploration of categorical variables. From http://www.willmcginnis.com/2015/11/29/beyond-one-hot-an-exploration-of-categorical-variables/
142142
6. BaseN Encoding and Grid Search in categorical variables. From http://www.willmcginnis.com/2016/12/18/basen-encoding-grid-search-category_encoders/
143143
7. Daniele Miccii-Barreca (2001). A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems. SIGKDD Explor. Newsl. 3, 1. From http://dx.doi.org/10.1145/507533.507538

category_encoders/backward_difference.py

Lines changed: 8 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Backward difference contrast encoding"""
22

33
import pandas as pd
4-
from sklearn.base import BaseEstimator, TransformerMixin
54
from patsy.contrasts import Diff
65
import numpy as np
76
from category_encoders.ordinal import OrdinalEncoder
@@ -10,7 +9,7 @@
109
__author__ = 'willmcginnis'
1110

1211

13-
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
12+
class BackwardDifferenceEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
1413
"""Backward difference contrast coding for encoding categorical variables.
1514
1615
Parameters
@@ -82,58 +81,17 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
8281
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
8382
8483
"""
84+
prefit_ordinal = True
85+
encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE
8586

8687
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
8788
handle_unknown='value', handle_missing='value'):
88-
self.return_df = return_df
89-
self.drop_invariant = drop_invariant
90-
self.drop_cols = []
91-
self.verbose = verbose
89+
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
90+
handle_unknown=handle_unknown, handle_missing=handle_missing)
9291
self.mapping = mapping
93-
self.handle_unknown = handle_unknown
94-
self.handle_missing = handle_missing
95-
self.cols = cols
9692
self.ordinal_encoder = None
97-
self._dim = None
98-
self.feature_names = None
99-
100-
def fit(self, X, y=None, **kwargs):
101-
"""Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
102-
generally invariant columns to drop consistently.
103-
104-
Parameters
105-
----------
106-
107-
X : array-like, shape = [n_samples, n_features]
108-
Training vectors, where n_samples is the number of samples
109-
and n_features is the number of features.
110-
y : array-like, shape = [n_samples]
111-
Target values.
112-
113-
Returns
114-
-------
115-
116-
self : encoder
117-
Returns self.
118-
119-
"""
120-
121-
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
122-
# first check the type
123-
X = util.convert_input(X)
124-
125-
self._dim = X.shape[1]
126-
127-
# if columns aren't passed, just use every string column
128-
if self.cols is None:
129-
self.cols = util.get_obj_cols(X)
130-
else:
131-
self.cols = util.convert_cols_to_list(self.cols)
132-
133-
if self.handle_missing == 'error':
134-
if X[self.cols].isnull().any().any():
135-
raise ValueError('Columns to be encoded can not contain null')
13693

94+
def _fit(self, X, y=None, **kwargs):
13795
# train an ordinal pre-encoder
13896
self.ordinal_encoder = OrdinalEncoder(
13997
verbose=self.verbose,
@@ -155,70 +113,14 @@ def fit(self, X, y=None, **kwargs):
155113

156114
self.mapping = mappings_out
157115

158-
X_temp = self.transform(X, override_return_df=True)
159-
self.feature_names = X_temp.columns.tolist()
160-
161-
# drop all output columns with 0 variance.
162-
if self.drop_invariant:
163-
self.drop_cols = []
164-
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
165-
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
166-
try:
167-
[self.feature_names.remove(x) for x in self.drop_cols]
168-
except KeyError as e:
169-
if self.verbose > 0:
170-
print(f"Could not remove column from feature names. Not found in generated cols.\n{e}")
171-
172-
return self
173-
174-
def transform(self, X, override_return_df=False):
175-
"""Perform the transformation to new categorical data.
176-
177-
Parameters
178-
----------
179-
180-
X : array-like, shape = [n_samples, n_features]
181-
182-
Returns
183-
-------
184-
185-
p : array, shape = [n_samples, n_numeric + N]
186-
Transformed values with encoding applied.
187-
188-
"""
189-
190-
if self.handle_missing == 'error':
191-
if X[self.cols].isnull().any().any():
192-
raise ValueError('Columns to be encoded can not contain null')
193-
194-
if self._dim is None:
195-
raise ValueError('Must train encoder before it can be used to transform data.')
196-
197-
# first check the type
198-
X = util.convert_input(X)
199-
200-
# then make sure that it is the right size
201-
if X.shape[1] != self._dim:
202-
raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}')
203-
204-
if not list(self.cols):
205-
return X
206-
116+
def _transform(self, X) -> pd.DataFrame:
207117
X = self.ordinal_encoder.transform(X)
208-
209118
if self.handle_unknown == 'error':
210119
if X[self.cols].isin([-1]).any().any():
211120
raise ValueError('Columns to be encoded can not contain new values')
212121

213122
X = self.backward_difference_coding(X, mapping=self.mapping)
214-
215-
if self.drop_invariant:
216-
X = X.drop(columns=self.drop_cols)
217-
218-
if self.return_df or override_return_df:
219-
return X
220-
else:
221-
return X.values
123+
return X
222124

223125
@staticmethod
224126
def fit_backward_difference_coding(col, values, handle_missing, handle_unknown):
@@ -274,20 +176,3 @@ def backward_difference_coding(X_in, mapping):
274176
cols = ['intercept'] + cols
275177

276178
return X.reindex(columns=cols)
277-
278-
def get_feature_names(self):
279-
"""
280-
Returns the names of all transformed / added columns.
281-
282-
Returns
283-
-------
284-
feature_names: list
285-
A list with all feature names transformed or added.
286-
Note: potentially dropped features are not included!
287-
288-
"""
289-
290-
if not isinstance(self.feature_names, list):
291-
raise ValueError('Must fit data first. Affected feature names are not known before.')
292-
else:
293-
return self.feature_names

category_encoders/basen.py

Lines changed: 10 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22

33
import pandas as pd
44
import numpy as np
5-
import math
65
import re
7-
from sklearn.base import BaseEstimator, TransformerMixin
86
from category_encoders.ordinal import OrdinalEncoder
97
import category_encoders.utils as util
108
import warnings
@@ -33,7 +31,7 @@ def _ceillogint(n, base):
3331
return ret
3432

3533

36-
class BaseNEncoder(BaseEstimator, TransformerMixin):
34+
class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
3735
"""Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to
3836
one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
3937
categories is equivalent to vanilla ordinal encoding.
@@ -98,57 +96,18 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
9896
9997
"""
10098

99+
prefit_ordinal = True
100+
encoding_relation = util.EncodingRelation.N_TO_M
101+
101102
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2,
102103
handle_unknown='value', handle_missing='value'):
103-
self.return_df = return_df
104-
self.drop_invariant = drop_invariant
105-
self.drop_cols = []
106-
self.verbose = verbose
107-
self.handle_unknown = handle_unknown
108-
self.handle_missing = handle_missing
109-
self.cols = cols
104+
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
105+
handle_unknown=handle_unknown, handle_missing=handle_missing)
110106
self.mapping = mapping
111107
self.ordinal_encoder = None
112-
self._dim = None
113108
self.base = base
114-
self._encoded_columns = None
115-
self.feature_names = None
116-
117-
def fit(self, X, y=None, **kwargs):
118-
"""Fit encoder according to X and y.
119-
120-
Parameters
121-
----------
122-
123-
X : array-like, shape = [n_samples, n_features]
124-
Training vectors, where n_samples is the number of samples
125-
and n_features is the number of features.
126-
y : array-like, shape = [n_samples]
127-
Target values.
128-
129-
Returns
130-
-------
131-
132-
self : encoder
133-
Returns self.
134-
135-
"""
136-
137-
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
138-
X = util.convert_input(X)
139-
140-
self._dim = X.shape[1]
141-
142-
# if columns aren't passed, just use every string column
143-
if self.cols is None:
144-
self.cols = util.get_obj_cols(X)
145-
else:
146-
self.cols = util.convert_cols_to_list(self.cols)
147-
148-
if self.handle_missing == 'error':
149-
if X[self.cols].isnull().any().any():
150-
raise ValueError('Columns to be encoded can not contain null')
151109

110+
def _fit(self, X, y=None, **kwargs):
152111
# train an ordinal pre-encoder
153112
self.ordinal_encoder = OrdinalEncoder(
154113
verbose=self.verbose,
@@ -160,24 +119,6 @@ def fit(self, X, y=None, **kwargs):
160119

161120
self.mapping = self.fit_base_n_encoding(X)
162121

163-
# do a transform on the training data to get a column list
164-
X_temp = self.transform(X, override_return_df=True)
165-
self._encoded_columns = X_temp.columns.values
166-
self.feature_names = list(X_temp.columns)
167-
168-
# drop all output columns with 0 variance.
169-
if self.drop_invariant:
170-
self.drop_cols = []
171-
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
172-
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
173-
try:
174-
[self.feature_names.remove(x) for x in self.drop_cols]
175-
except KeyError as e:
176-
if self.verbose > 0:
177-
print(f"Could not remove column from feature names. Not found in generated cols.\n{e}")
178-
179-
return self
180-
181122
def fit_base_n_encoding(self, X):
182123
mappings_out = []
183124

@@ -210,58 +151,15 @@ def fit_base_n_encoding(self, X):
210151

211152
return mappings_out
212153

213-
def transform(self, X, override_return_df=False):
214-
"""Perform the transformation to new categorical data.
215-
216-
Parameters
217-
----------
218-
219-
X : array-like, shape = [n_samples, n_features]
220-
221-
Returns
222-
-------
223-
224-
p : array, shape = [n_samples, n_numeric + N]
225-
Transformed values with encoding applied.
226-
227-
"""
228-
229-
if self.handle_missing == 'error':
230-
if X[self.cols].isnull().any().any():
231-
raise ValueError('Columns to be encoded can not contain null')
232-
233-
if self._dim is None:
234-
raise ValueError('Must train encoder before it can be used to transform data.')
235-
236-
# first check the type
237-
X = util.convert_input(X)
238-
239-
# then make sure that it is the right size
240-
if X.shape[1] != self._dim:
241-
raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}')
242-
243-
if not list(self.cols):
244-
return X
245-
154+
def _transform(self, X):
246155
X_out = self.ordinal_encoder.transform(X)
247156

248157
if self.handle_unknown == 'error':
249158
if X_out[self.cols].isin([-1]).any().any():
250159
raise ValueError('Columns to be encoded can not contain new values')
251160

252161
X_out = self.basen_encode(X_out, cols=self.cols)
253-
254-
if self.drop_invariant:
255-
X_out = X_out.drop(columns=self.drop_cols)
256-
257-
# impute missing values only in the generated columns
258-
# generated_cols = util.get_generated_cols(X, X_out, self.cols)
259-
# X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0)
260-
261-
if self.return_df or override_return_df:
262-
return X_out
263-
else:
264-
return X_out.values
162+
return X_out
265163

266164
def inverse_transform(self, X_in):
267165
"""
@@ -306,7 +204,7 @@ def inverse_transform(self, X_in):
306204
for col in self.cols:
307205
if X[switch.get('col')].isnull().any():
308206
warnings.warn("inverse_transform is not supported because transform impute "
309-
"the unknown category nan when encode %s" % (col,))
207+
f"the unknown category nan when encode {col}")
310208

311209
return X if self.return_df else X.values
312210

@@ -415,20 +313,3 @@ def number_to_base(n, b, limit):
415313
n, _ = divmod(n, b)
416314

417315
return digits[::-1]
418-
419-
def get_feature_names(self):
420-
"""
421-
Returns the names of all transformed / added columns.
422-
423-
Returns
424-
-------
425-
feature_names: list
426-
A list with all feature names transformed or added.
427-
Note: potentially dropped features are not included!
428-
429-
"""
430-
431-
if not isinstance(self.feature_names, list):
432-
raise ValueError('Must fit data first. Affected feature names are not known before.')
433-
else:
434-
return self.feature_names

0 commit comments

Comments
 (0)