Skip to content

Commit 2e32822

Browse files
Merge pull request #325 from PaulWestenthanner/refactor/base_class
Refactor/base class
2 parents a18cb64 + f3afca8 commit 2e32822

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+669
-2330
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ coverage.xml
5656

5757
# Sphinx documentation
5858
docs/_build/
59+
docs/build/
5960

6061
# PyBuilder
6162
target/
@@ -68,4 +69,4 @@ runtest.py
6869

6970
*~
7071
*.swp
71-
*.swo
72+
*.swo

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ References
137137
1. Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing for Large Scale Multitask Learning. Proc. ICML.
138138
2. Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. From https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
139139
3. Gregory Carey (2003). Coding Categorical Variables. From http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
140-
4. Strategies to encode categorical variables with many categories. From https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.
140+
4. Owen Zhang - Leave One Out Encoding. From https://datascience.stackexchange.com/questions/10839/what-is-difference-between-one-hot-encoding-and-leave-one-out-encoding
141141
5. Beyond One-Hot: an exploration of categorical variables. From http://www.willmcginnis.com/2015/11/29/beyond-one-hot-an-exploration-of-categorical-variables/
142142
6. BaseN Encoding and Grid Search in categorical variables. From http://www.willmcginnis.com/2016/12/18/basen-encoding-grid-search-category_encoders/
143143
7. Daniele Miccii-Barreca (2001). A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems. SIGKDD Explor. Newsl. 3, 1. From http://dx.doi.org/10.1145/507533.507538

category_encoders/backward_difference.py

Lines changed: 8 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Backward difference contrast encoding"""
22

33
import pandas as pd
4-
from sklearn.base import BaseEstimator, TransformerMixin
54
from patsy.contrasts import Diff
65
import numpy as np
76
from category_encoders.ordinal import OrdinalEncoder
@@ -10,7 +9,7 @@
109
__author__ = 'willmcginnis'
1110

1211

13-
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
12+
class BackwardDifferenceEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
1413
"""Backward difference contrast coding for encoding categorical variables.
1514
1615
Parameters
@@ -82,58 +81,17 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
8281
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
8382
8483
"""
84+
prefit_ordinal = True
85+
encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE
8586

8687
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
8788
handle_unknown='value', handle_missing='value'):
88-
self.return_df = return_df
89-
self.drop_invariant = drop_invariant
90-
self.drop_cols = []
91-
self.verbose = verbose
89+
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
90+
handle_unknown=handle_unknown, handle_missing=handle_missing)
9291
self.mapping = mapping
93-
self.handle_unknown = handle_unknown
94-
self.handle_missing = handle_missing
95-
self.cols = cols
9692
self.ordinal_encoder = None
97-
self._dim = None
98-
self.feature_names = None
99-
100-
def fit(self, X, y=None, **kwargs):
101-
"""Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
102-
generally invariant columns to drop consistently.
103-
104-
Parameters
105-
----------
106-
107-
X : array-like, shape = [n_samples, n_features]
108-
Training vectors, where n_samples is the number of samples
109-
and n_features is the number of features.
110-
y : array-like, shape = [n_samples]
111-
Target values.
112-
113-
Returns
114-
-------
115-
116-
self : encoder
117-
Returns self.
118-
119-
"""
120-
121-
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
122-
# first check the type
123-
X = util.convert_input(X)
124-
125-
self._dim = X.shape[1]
126-
127-
# if columns aren't passed, just use every string column
128-
if self.cols is None:
129-
self.cols = util.get_obj_cols(X)
130-
else:
131-
self.cols = util.convert_cols_to_list(self.cols)
132-
133-
if self.handle_missing == 'error':
134-
if X[self.cols].isnull().any().any():
135-
raise ValueError('Columns to be encoded can not contain null')
13693

94+
def _fit(self, X, y=None, **kwargs):
13795
# train an ordinal pre-encoder
13896
self.ordinal_encoder = OrdinalEncoder(
13997
verbose=self.verbose,
@@ -155,70 +113,14 @@ def fit(self, X, y=None, **kwargs):
155113

156114
self.mapping = mappings_out
157115

158-
X_temp = self.transform(X, override_return_df=True)
159-
self.feature_names = X_temp.columns.tolist()
160-
161-
# drop all output columns with 0 variance.
162-
if self.drop_invariant:
163-
self.drop_cols = []
164-
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
165-
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
166-
try:
167-
[self.feature_names.remove(x) for x in self.drop_cols]
168-
except KeyError as e:
169-
if self.verbose > 0:
170-
print(f"Could not remove column from feature names. Not found in generated cols.\n{e}")
171-
172-
return self
173-
174-
def transform(self, X, override_return_df=False):
175-
"""Perform the transformation to new categorical data.
176-
177-
Parameters
178-
----------
179-
180-
X : array-like, shape = [n_samples, n_features]
181-
182-
Returns
183-
-------
184-
185-
p : array, shape = [n_samples, n_numeric + N]
186-
Transformed values with encoding applied.
187-
188-
"""
189-
190-
if self.handle_missing == 'error':
191-
if X[self.cols].isnull().any().any():
192-
raise ValueError('Columns to be encoded can not contain null')
193-
194-
if self._dim is None:
195-
raise ValueError('Must train encoder before it can be used to transform data.')
196-
197-
# first check the type
198-
X = util.convert_input(X)
199-
200-
# then make sure that it is the right size
201-
if X.shape[1] != self._dim:
202-
raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}')
203-
204-
if not list(self.cols):
205-
return X
206-
116+
def _transform(self, X) -> pd.DataFrame:
207117
X = self.ordinal_encoder.transform(X)
208-
209118
if self.handle_unknown == 'error':
210119
if X[self.cols].isin([-1]).any().any():
211120
raise ValueError('Columns to be encoded can not contain new values')
212121

213122
X = self.backward_difference_coding(X, mapping=self.mapping)
214-
215-
if self.drop_invariant:
216-
X = X.drop(columns=self.drop_cols)
217-
218-
if self.return_df or override_return_df:
219-
return X
220-
else:
221-
return X.values
123+
return X
222124

223125
@staticmethod
224126
def fit_backward_difference_coding(col, values, handle_missing, handle_unknown):
@@ -274,20 +176,3 @@ def backward_difference_coding(X_in, mapping):
274176
cols = ['intercept'] + cols
275177

276178
return X.reindex(columns=cols)
277-
278-
def get_feature_names(self):
279-
"""
280-
Returns the names of all transformed / added columns.
281-
282-
Returns
283-
-------
284-
feature_names: list
285-
A list with all feature names transformed or added.
286-
Note: potentially dropped features are not included!
287-
288-
"""
289-
290-
if not isinstance(self.feature_names, list):
291-
raise ValueError('Must fit data first. Affected feature names are not known before.')
292-
else:
293-
return self.feature_names

category_encoders/basen.py

Lines changed: 10 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22

33
import pandas as pd
44
import numpy as np
5-
import math
65
import re
7-
from sklearn.base import BaseEstimator, TransformerMixin
86
from category_encoders.ordinal import OrdinalEncoder
97
import category_encoders.utils as util
108
import warnings
@@ -33,7 +31,7 @@ def _ceillogint(n, base):
3331
return ret
3432

3533

36-
class BaseNEncoder(BaseEstimator, TransformerMixin):
34+
class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
3735
"""Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to
3836
one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
3937
categories is equivalent to vanilla ordinal encoding.
@@ -98,57 +96,18 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
9896
9997
"""
10098

99+
prefit_ordinal = True
100+
encoding_relation = util.EncodingRelation.N_TO_M
101+
101102
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2,
102103
handle_unknown='value', handle_missing='value'):
103-
self.return_df = return_df
104-
self.drop_invariant = drop_invariant
105-
self.drop_cols = []
106-
self.verbose = verbose
107-
self.handle_unknown = handle_unknown
108-
self.handle_missing = handle_missing
109-
self.cols = cols
104+
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
105+
handle_unknown=handle_unknown, handle_missing=handle_missing)
110106
self.mapping = mapping
111107
self.ordinal_encoder = None
112-
self._dim = None
113108
self.base = base
114-
self._encoded_columns = None
115-
self.feature_names = None
116-
117-
def fit(self, X, y=None, **kwargs):
118-
"""Fit encoder according to X and y.
119-
120-
Parameters
121-
----------
122-
123-
X : array-like, shape = [n_samples, n_features]
124-
Training vectors, where n_samples is the number of samples
125-
and n_features is the number of features.
126-
y : array-like, shape = [n_samples]
127-
Target values.
128-
129-
Returns
130-
-------
131-
132-
self : encoder
133-
Returns self.
134-
135-
"""
136-
137-
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
138-
X = util.convert_input(X)
139-
140-
self._dim = X.shape[1]
141-
142-
# if columns aren't passed, just use every string column
143-
if self.cols is None:
144-
self.cols = util.get_obj_cols(X)
145-
else:
146-
self.cols = util.convert_cols_to_list(self.cols)
147-
148-
if self.handle_missing == 'error':
149-
if X[self.cols].isnull().any().any():
150-
raise ValueError('Columns to be encoded can not contain null')
151109

110+
def _fit(self, X, y=None, **kwargs):
152111
# train an ordinal pre-encoder
153112
self.ordinal_encoder = OrdinalEncoder(
154113
verbose=self.verbose,
@@ -160,24 +119,6 @@ def fit(self, X, y=None, **kwargs):
160119

161120
self.mapping = self.fit_base_n_encoding(X)
162121

163-
# do a transform on the training data to get a column list
164-
X_temp = self.transform(X, override_return_df=True)
165-
self._encoded_columns = X_temp.columns.values
166-
self.feature_names = list(X_temp.columns)
167-
168-
# drop all output columns with 0 variance.
169-
if self.drop_invariant:
170-
self.drop_cols = []
171-
generated_cols = util.get_generated_cols(X, X_temp, self.cols)
172-
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
173-
try:
174-
[self.feature_names.remove(x) for x in self.drop_cols]
175-
except KeyError as e:
176-
if self.verbose > 0:
177-
print(f"Could not remove column from feature names. Not found in generated cols.\n{e}")
178-
179-
return self
180-
181122
def fit_base_n_encoding(self, X):
182123
mappings_out = []
183124

@@ -210,58 +151,15 @@ def fit_base_n_encoding(self, X):
210151

211152
return mappings_out
212153

213-
def transform(self, X, override_return_df=False):
214-
"""Perform the transformation to new categorical data.
215-
216-
Parameters
217-
----------
218-
219-
X : array-like, shape = [n_samples, n_features]
220-
221-
Returns
222-
-------
223-
224-
p : array, shape = [n_samples, n_numeric + N]
225-
Transformed values with encoding applied.
226-
227-
"""
228-
229-
if self.handle_missing == 'error':
230-
if X[self.cols].isnull().any().any():
231-
raise ValueError('Columns to be encoded can not contain null')
232-
233-
if self._dim is None:
234-
raise ValueError('Must train encoder before it can be used to transform data.')
235-
236-
# first check the type
237-
X = util.convert_input(X)
238-
239-
# then make sure that it is the right size
240-
if X.shape[1] != self._dim:
241-
raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}')
242-
243-
if not list(self.cols):
244-
return X
245-
154+
def _transform(self, X):
246155
X_out = self.ordinal_encoder.transform(X)
247156

248157
if self.handle_unknown == 'error':
249158
if X_out[self.cols].isin([-1]).any().any():
250159
raise ValueError('Columns to be encoded can not contain new values')
251160

252161
X_out = self.basen_encode(X_out, cols=self.cols)
253-
254-
if self.drop_invariant:
255-
X_out = X_out.drop(columns=self.drop_cols)
256-
257-
# impute missing values only in the generated columns
258-
# generated_cols = util.get_generated_cols(X, X_out, self.cols)
259-
# X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0)
260-
261-
if self.return_df or override_return_df:
262-
return X_out
263-
else:
264-
return X_out.values
162+
return X_out
265163

266164
def inverse_transform(self, X_in):
267165
"""
@@ -306,7 +204,7 @@ def inverse_transform(self, X_in):
306204
for col in self.cols:
307205
if X[switch.get('col')].isnull().any():
308206
warnings.warn("inverse_transform is not supported because transform impute "
309-
"the unknown category nan when encode %s" % (col,))
207+
f"the unknown category nan when encode {col}")
310208

311209
return X if self.return_df else X.values
312210

@@ -415,20 +313,3 @@ def number_to_base(n, b, limit):
415313
n, _ = divmod(n, b)
416314

417315
return digits[::-1]
418-
419-
def get_feature_names(self):
420-
"""
421-
Returns the names of all transformed / added columns.
422-
423-
Returns
424-
-------
425-
feature_names: list
426-
A list with all feature names transformed or added.
427-
Note: potentially dropped features are not included!
428-
429-
"""
430-
431-
if not isinstance(self.feature_names, list):
432-
raise ValueError('Must fit data first. Affected feature names are not known before.')
433-
else:
434-
return self.feature_names

0 commit comments

Comments
 (0)