Skip to content

Commit

Permalink
Changes for formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
imanzaf committed Jun 20, 2023
1 parent 5c6e9fe commit 459289b
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 32 deletions.
4 changes: 2 additions & 2 deletions get_slice_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
test = train_test_split(df, test_size=0.20, random_state=42)[1]

# Transform data
categorical_cols = [x for x in test.columns if x not in
test.select_dtypes(include=np.number).columns.tolist() and x not in ['salary']]
categorical_cols = [x for x in test.columns if x not in test.select_dtypes(
include=np.number).columns.tolist() and x not in ['salary']]

X_test, y_test = dt.process_data(test,
categorical_features=categorical_cols,
Expand Down
37 changes: 21 additions & 16 deletions src/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Functions to clean and preprocess data
'''

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

Expand All @@ -26,29 +25,35 @@ def clean_data(df):

return df


def process_data(
df, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
df,
categorical_features=[],
label=None,
training=True,
encoder=None,
lb=None):
"""
Process the data used in the machine learning pipeline.
(function taken from udacity starter code)
Processes the data using one hot encoding for the categorical features and a
label binarizer for the labels. This can be used in either training or
inference/validation.
Processes the data using one hot encoding for the categorical
features and a label binarizer for the labels. This can be used
in either training or inference/validation.
Note: depending on the type of model used, you may want to add in functionality that
scales the continuous data.
Note: depending on the type of model used, you may want to add
in functionality that scales the continuous data.
Inputs
------
X : pd.DataFrame
Dataframe containing the features and label. Columns in `categorical_features`
df : pd.DataFrame
Dataframe containing the features and label.
Columns listed in `categorical_features` are one-hot encoded.
categorical_features: list[str]
List containing the names of the categorical features (default=[])
label : str
Name of the label column in `X`. If None, then an empty array will be returned
for y (default=None)
Name of the label column in `df`. If None, then an empty array
will be returned for y (default=None)
training : bool
Indicator if training mode or inference/validation mode.
encoder : sklearn.preprocessing._encoders.OneHotEncoder
Expand All @@ -63,11 +68,11 @@ def process_data(
y : np.array
Processed labels if `label` is not None, otherwise an empty np.array.
encoder : sklearn.preprocessing._encoders.OneHotEncoder
Trained OneHotEncoder if training is True, otherwise returns the encoder passed
in.
Trained OneHotEncoder if training is True,
otherwise returns the encoder passed in.
lb : sklearn.preprocessing._label.LabelBinarizer
Trained LabelBinarizer if training is True, otherwise returns the binarizer
passed in.
Trained LabelBinarizer if training is True,
otherwise returns the binarizer passed in.
"""

if label is not None:
Expand Down
13 changes: 10 additions & 3 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,13 @@ def slice_metrics(feature, X, y, preds):
:returns metrics: df with metrics for each slice
"""
# initialize empty df for metrics
metrics = pd.DataFrame(columns=['precision', 'recall', 'fbeta', 'accuracy'],
index=X[feature].unique().tolist())
metrics = pd.DataFrame(
columns=[
'precision',
'recall',
'fbeta',
'accuracy'],
index=X[feature].unique().tolist())

# join features and label / preds
df = X.copy()
Expand All @@ -118,6 +123,8 @@ def slice_metrics(feature, X, y, preds):
accuracy = accuracy_score(y_true, y_pred)

# add metrics to df
metrics.loc[slice] = pd.Series({'precision': precision, 'recall': recall, 'fbeta': fbeta, 'accuracy': accuracy})
metrics.loc[slice] = pd.Series(
{'precision': precision, 'recall': recall,
'fbeta': fbeta, 'accuracy': accuracy})

return metrics
23 changes: 12 additions & 11 deletions train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,18 @@
and x not in ['salary']]

# Transform data
X_train, y_train, encoder, lb = dt.process_data(train,
categorical_features=categorical_cols,
label='salary',
training=True)

X_test, y_test, encoder, lb = dt.process_data(test,
categorical_features=categorical_cols,
label='salary',
encoder=encoder,
lb=lb,
training=False)
X_train, y_train, encoder, lb = \
dt.process_data(train,
categorical_features=categorical_cols,
label='salary', training=True)

X_test, y_test, encoder, lb = \
dt.process_data(test,
categorical_features=categorical_cols,
label='salary',
encoder=encoder,
lb=lb,
training=False)

# Train model
lgbm = ml.train_model(X_train, y_train)
Expand Down

0 comments on commit 459289b

Please sign in to comment.