Changes for formatting

imanzaf · Jun 20, 2023 · 459289b · 459289b
1 parent 5c6e9fe
commit 459289b
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 32 deletions.
diff --git a/get_slice_metrics.py b/get_slice_metrics.py
@@ -23,8 +23,8 @@
 test = train_test_split(df, test_size=0.20, random_state=42)[1]
 
 # Transform data
-categorical_cols = [x for x in test.columns if x not in
-                        test.select_dtypes(include=np.number).columns.tolist() and x not in ['salary']]
+categorical_cols = [x for x in test.columns if x not in test.select_dtypes(
+    include=np.number).columns.tolist() and x not in ['salary']]
 
 X_test, y_test = dt.process_data(test,
                                  categorical_features=categorical_cols,

diff --git a/src/data.py b/src/data.py
@@ -2,7 +2,6 @@
 Functions to clean and preprocess data
 '''
 
-import pandas as pd
 import numpy as np
 from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
 
@@ -26,29 +25,35 @@ def clean_data(df):
 
     return df
 
+
 def process_data(
-    df, categorical_features=[], label=None, training=True, encoder=None, lb=None
-):
+        df,
+        categorical_features=[],
+        label=None,
+        training=True,
+        encoder=None,
+        lb=None):
     """
     Process the data used in the machine learning pipeline.
     (function taken from udacity starter code)
 
-    Processes the data using one hot encoding for the categorical features and a
-    label binarizer for the labels. This can be used in either training or
-    inference/validation.
+    Processes the data using one hot encoding for the categorical
+    features and a label binarizer for the labels. This can be used
+    in either training or inference/validation.
 
-    Note: depending on the type of model used, you may want to add in functionality that
-    scales the continuous data.
+    Note: depending on the type of model used, you may want to add
+    in functionality that scales the continuous data.
 
     Inputs
     ------
-    X : pd.DataFrame
-        Dataframe containing the features and label. Columns in `categorical_features`
+    df : pd.DataFrame
+        Dataframe containing the features and label.
+        Columns in `categorical_features` are one-hot encoded.
     categorical_features: list[str]
         List containing the names of the categorical features (default=[])
     label : str
-        Name of the label column in `X`. If None, then an empty array will be returned
-        for y (default=None)
+        Name of the label column in `df`. If None, then an empty array
+        will be returned for y (default=None)
     training : bool
         Indicator if training mode or inference/validation mode.
     encoder : sklearn.preprocessing._encoders.OneHotEncoder
@@ -63,11 +68,11 @@ def process_data(
     y : np.array
         Processed labels if labeled=True, otherwise empty np.array.
     encoder : sklearn.preprocessing._encoders.OneHotEncoder
-        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
-        in.
+        Trained OneHotEncoder if training is True,
+        otherwise returns the encoder passed in.
     lb : sklearn.preprocessing._label.LabelBinarizer
-        Trained LabelBinarizer if training is True, otherwise returns the binarizer
-        passed in.
+        Trained LabelBinarizer if training is True,
+        otherwise returns the binarizer passed in.
     """
 
     if label is not None:

diff --git a/src/model.py b/src/model.py
@@ -96,8 +96,13 @@ def slice_metrics(feature, X, y, preds):
     :returns metrics: df with metrics for each slice
     """
     # initialize empty df for metrics
-    metrics = pd.DataFrame(columns=['precision', 'recall', 'fbeta', 'accuracy'],
-                           index=X[feature].unique().tolist())
+    metrics = pd.DataFrame(
+        columns=[
+            'precision',
+            'recall',
+            'fbeta',
+            'accuracy'],
+        index=X[feature].unique().tolist())
 
     # join features and label / preds
     df = X.copy()
@@ -118,6 +123,8 @@ def slice_metrics(feature, X, y, preds):
         accuracy = accuracy_score(y_true, y_pred)
 
         # add metrics to df
-        metrics.loc[slice] = pd.Series({'precision': precision, 'recall': recall, 'fbeta': fbeta, 'accuracy': accuracy})
+        metrics.loc[slice] = pd.Series(
+            {'precision': precision, 'recall': recall,
+             'fbeta': fbeta, 'accuracy': accuracy})
 
     return metrics
diff --git a/train_model.py b/train_model.py
@@ -27,17 +27,18 @@
                         and x not in ['salary']]
 
     # Transform data
-    X_train, y_train, encoder, lb = dt.process_data(train,
-                                                    categorical_features=categorical_cols,
-                                                    label='salary',
-                                                    training=True)
-
-    X_test, y_test, encoder, lb = dt.process_data(test,
-                                                  categorical_features=categorical_cols,
-                                                  label='salary',
-                                                  encoder=encoder,
-                                                  lb=lb,
-                                                  training=False)
+    X_train, y_train, encoder, lb = \
+        dt.process_data(train,
+                        categorical_features=categorical_cols,
+                        label='salary', training=True)
+
+    X_test, y_test, encoder, lb = \
+        dt.process_data(test,
+                        categorical_features=categorical_cols,
+                        label='salary',
+                        encoder=encoder,
+                        lb=lb,
+                        training=False)
 
     # Train model
     lgbm = ml.train_model(X_train, y_train)