Merge pull request #1 from nlddfn/hotfix/debug

nlddfn · web-flow · commit aacba86b173e · 2019-10-13T21:03:11.000-04:00
Hotfix/debug
diff --git a/README.md b/README.md
@@ -1,5 +1,6 @@
 # ML Kit: a wrapper around scikit-learn
 ## Getting Started
+Clone the repo and set up a virtual environment. Tox expects Python 3.7; nevertheless, the code is also tested on Python 3.6. Should `make test` fail, please change `tox.ini` to match your version of Python.
 
 * `make test`
 
diff --git a/mlkit/BaseData.py b/mlkit/BaseData.py
@@ -48,6 +48,10 @@ def __init__(self, df, logger, target=None):
         #  Initiate logging
         self.log = logger
 
+        # Initialize class attributes
+        self.classes = None
+        self.nclas = None
+
     def EDA(self, what=['basic']):
         """Exploratory data analysis. Max(20000 samples)"""
         eda_frac = min(1, 20000 / self._X.shape[0])
diff --git a/mlkit/BaseModel.py b/mlkit/BaseModel.py
@@ -128,6 +128,10 @@ def threshold_optimization_by_metric(self, metric, model):
             return self.ROC_AUC(model)
         elif metric == 'Precision_Recall':
             return self.Precision_Recall(model)
+        elif type(metric) == float:
+            assert 0 < metric < 1., \
+                'If the value for metric is numerical, it should be between 0 and 1'
+            return {0: metric}
         else:
             score_lst = []
             thr_lst = np.linspace(.01, .99, 50, endpoint=True)
diff --git a/mlkit/BaseSupervised.py b/mlkit/BaseSupervised.py
@@ -5,7 +5,6 @@
 from sklearn.model_selection import GridSearchCV
 
 from mlkit.BasePreProcessing import BasePreProcessing
-from mlkit.base_model_utils import matthews_binary
 
 
 class BaseClassifier(BasePreProcessing):
@@ -17,7 +16,7 @@ def base_classifier(self,
                         grid=None,
                         fit_params={},
                         cv_params={},
-                        metric=matthews_binary,
+                        threshold_metric=0.5,
                         return_prob=False,
                         owr=False,
                         model_bin=None):
@@ -88,7 +87,7 @@ def base_classifier(self,
             # If binary, use ROC to set model.class_threshold
             if self.data.nclas == 2:
                 best_est.class_threshold = self.threshold_optimization_by_metric(
-                    metric=metric, model=best_est)
+                    metric=threshold_metric, model=best_est)
                 self.log(
                     'Classification threshold is {}'.format(
                         round(best_est.class_threshold[0], 2)))
diff --git a/mlkit/Pipe.py b/mlkit/Pipe.py
@@ -143,7 +143,7 @@ def run(self, data):
 
         pred_lst = []
         # Create ensemble data obj
-        for i, pipe in enumerate(self.pipe_list):
+        for pipe in self.pipe_list:
             data, model = pipe.run(data)
             pred = data.get_prediction(model)
             pred_lst.append(pred)
@@ -154,7 +154,7 @@ def run(self, data):
 
         # Create Data obj for final estimator
         ens_data = Data(df=ens_df,
-                        log_path=getattr(data, 'log'),
+                        logger=getattr(data, 'log'),
                         target=getattr(data, '_target'))
         setattr(ens_data, 'classes', data.classes)
         setattr(ens_data, 'nclas', data.nclas)
@@ -185,7 +185,7 @@ def predict(self, data):
         ens_df.columns = list(range(ens_df.shape[1]))
 
         ens_data = Data(df=ens_df,
-                        log_path=getattr(data, 'model_path'),
+                        logger=getattr(data, 'log'),
                         target=getattr(data, '_target'))
         pred = self.ens_pip.predict(ens_data)
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,15 @@
 category-encoders==1.3.0
 dill==0.2.8.2
 flake8==3.7.7
-imbalanced-learn==0.4.3 
-numpy==1.15.2
-pandas==0.23.4
+imbalanced-learn>=0.4.3 
+numpy>=1.15.2
+pandas>=0.23.4
 pytest==3.6.2
 pytest-cov==2.5.1
 pytest-mock==1.10.4
 pytest-runner==4.4
 setuptools-scm==3.2.0
-scikit-learn==0.20.1
+scikit-learn>=0.20.1
 scikit-optimize==0.5.2
-scipy==1.1.0
-xgboost==0.80
+scipy>=1.1.0
+xgboost>=0.80
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -1,5 +1,5 @@
 
-from mlkit.BaseData import Data, Validator
+from mlkit.BaseData import Data
 from tests.test_base import TestBase
 
 
@@ -16,17 +16,6 @@ def test_basics(self, setup_model):
             what=['basic', 'missing', 'remarkable', 'target_agg', 'target_imbalance']
         )
 
-    def test_validation(self, setup_data):
-        # First introduce fake data and model
-        data = self.data_mock_clas
-
-        v = Validator(
-            data_obj=data,
-            model_path='examples',
-            n_samples=100
-        )
-        v.run_validation(test=data)
-
     def test_mock_models(self, setup_model):
         # Classification
         data_clas = self.data_mock_clas
diff --git a/tox.ini b/tox.ini
@@ -1,7 +1,7 @@
 [tox]
-envlist = py36,flake8
+envlist = py37,flake8
 
-[testenv:py36]
+[testenv:py37]
 deps = 
     -r requirements.txt
     coverage