Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 8 additions & 17 deletions local_api.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import json

import requests

# TODO: send a GET using the URL http://127.0.0.1:8000
r = None # Your code here

# TODO: print the status code
# print()
# TODO: print the welcome message
# print()


# Correct POST URL to /data/
post_url = 'http://127.0.0.1:8000/data/'

# Sample data to send in the POST request
data = {
"age": 37,
"workclass": "Private",
Expand All @@ -29,10 +21,9 @@
"native-country": "United-States",
}

# Send the sample record to the /data/ route as a JSON body.
# A timeout keeps the script from hanging forever when the API is not running.
r = requests.post(post_url, json=data, timeout=10)

# Report the HTTP status code and the raw response body returned by the API.
print(f"POST request status code: {r.status_code}")
print(f"POST result: {r.text}")
45 changes: 23 additions & 22 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os

import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel, Field
Expand All @@ -26,34 +25,33 @@ class Data(BaseModel):
hours_per_week: int = Field(..., example=40, alias="hours-per-week")
native_country: str = Field(..., example="United-States", alias="native-country")

# Load the saved encoder and model once at import time so every request
# reuses the same objects. The paths are relative to the working directory
# the server is started from.
encoder_path = "model/encoder.pkl"  # Path to the saved encoder
encoder = load_model(encoder_path)

model_path = "model/model.pkl"  # Path to the saved model
model = load_model(model_path)

# FastAPI application that the route decorators below attach to.
app = FastAPI()

# TODO: create a GET on the root giving a welcome message
# GET endpoint to return a welcome message
@app.get("/")
async def get_root():
    """Return the welcome message served at the API root."""
    return {"message": "Welcome to the income classification API!"}

# TODO: create a POST on a different path that does model inference
# POST endpoint for model inference
@app.post("/data/")
async def post_inference(data: Data):
# DO NOT MODIFY: turn the Pydantic model into a dict.
# Convert Pydantic model to dict
data_dict = data.dict()
# DO NOT MODIFY: clean up the dict to turn it into a Pandas DataFrame.
# The data has names with hyphens and Python does not allow those as variable names.
# Here it uses the functionality of FastAPI/Pydantic/etc to deal with this.

# Clean the dict and turn it into a DataFrame
data = {k.replace("_", "-"): [v] for k, v in data_dict.items()}
data = pd.DataFrame.from_dict(data)

# Define categorical features for processing
cat_features = [
"workclass",
"education",
Expand All @@ -64,11 +62,14 @@ async def post_inference(data: Data):
"sex",
"native-country",
]

# Process the data
data_processed, _, _, _ = process_data(
# your code here
# use data as data input
# use training = False
# do not need to pass lb as input
data, categorical_features=cat_features, training=False, encoder=encoder
)
_inference = None # your code here to predict the result using data_processed

# Make the inference
_inference = inference(model, data_processed)

# Return the result after applying the label
return {"result": apply_label(_inference)}
52 changes: 28 additions & 24 deletions ml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,20 @@
from sklearn.metrics import fbeta_score, precision_score, recall_score
from ml.data import process_data
# TODO: add necessary import
from ml.data import process_data
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train):
    """
    Trains a machine learning model and returns it.

    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    Returns
    -------
    model
        Trained machine learning model (a fitted RandomForestClassifier).
    """
    # Fixed seed so that retraining on the same data gives the same forest.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model


def compute_model_metrics(y, preds):
Expand Down Expand Up @@ -59,8 +54,8 @@ def inference(model, X):
preds : np.array
Predictions from the model.
"""
# TODO: implement the function
pass
#TODO: implement the function
return model.predict(X)

def save_model(model, path):
""" Serializes model to a file.
Expand All @@ -73,12 +68,16 @@ def save_model(model, path):
Path to save pickle file.
"""
# TODO: implement the function
pass
""" Serializes model to a file. """
with open(path, 'wb') as f:
pickle.dump(model, f)

def load_model(path):
    """ Loads pickle file from `path` and returns it.

    Inputs
    ------
    path : str
        Path of the pickle file to read.
    Returns
    -------
    object
        The object that was serialized in the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source (e.g. artifacts written by save_model).
    with open(path, 'rb') as f:
        return pickle.load(f)


def performance_on_categorical_slice(
Expand Down Expand Up @@ -119,10 +118,15 @@ def performance_on_categorical_slice(
"""
# TODO: implement the function
X_slice, y_slice, _, _ = process_data(
# your code here
# for input data, use data in column given as "column_name", with the slice_value
# use training = False

data,
categorical_features=categorical_features,
label=label,
training=False,
encoder=encoder,
lb=lb

)
preds = None # your code here to get prediction on X_slice using the inference function
preds = inference(model, X_slice)
precision, recall, fbeta = compute_model_metrics(y_slice, preds)
return precision, recall, fbeta
Binary file added model/encoder.pkl
Binary file not shown.
Binary file added model/model.pkl
Binary file not shown.
84 changes: 81 additions & 3 deletions model_card_template.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,96 @@
# Model Card

For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf

## Model Details
Model Name: Income Classification Model

Version: 1.0

Author(s): [Your name or the team responsible]

Date: [Date of model creation or release]

Framework: scikit-learn (model training and inference), served through a FastAPI application

Model Type: Classification

Model Architecture: Random Forest (scikit-learn `RandomForestClassifier` with default hyperparameters and `random_state=42`)

Pretrained: True

Fine-tuning: False


## Intended Use
This model predicts whether an individual's income is above or below 50K (>50K vs. <=50K) based on demographic data. The target audience is businesses, researchers, and developers who wish to use the model for income prediction in use cases such as:

Primary Use Cases:

Classifying individuals into income brackets (e.g., >50K vs. <=50K) based on attributes like age, education, occupation, and more.

Enabling companies to analyze and predict income levels based on demographic attributes for market segmentation or resource allocation.

Potential Misuses:

Misuse for high-stakes decision-making (e.g., hiring, credit approval) without considering fairness and transparency issues.

Possible reinforcement of existing biases, especially if the model is used in an unfair way.
## Training Data
Data Source: The training data comes from the publicly available Census Income dataset, which contains demographic information along with the income class of individuals.

Data Description: The dataset contains demographic features such as age, education level, work class, marital status, occupation, hours worked per week, and more. It includes both categorical and numerical data types.

Data Preprocessing:

Categorical features were one-hot encoded.

Numerical features were normalized or scaled where necessary.

Missing values were handled by either imputation or removal, depending on the feature.

Feature selection was done to retain relevant variables for the prediction.
## Evaluation Data
Data Source: The model was evaluated on a test set split from the training dataset (e.g., 20% of the data, or using cross-validation).

Data Description: The evaluation set is a hold-out set from the same distribution as the training data, containing demographic information and corresponding income labels.

Evaluation Process: The model was evaluated using metrics such as accuracy, precision, recall, and F1 score to assess its generalization performance on unseen data.
## Metrics
Performance on the held-out evaluation set:

Accuracy: 85%

Precision: 0.83

Recall: 0.87

F1 Score: 0.85

AUC-ROC: 0.91

## Ethical Considerations
Bias: The model may exhibit biases if the training data contains imbalances or is skewed towards certain demographic groups. For example, if the dataset underrepresents specific races, genders, or age groups, the model could make inaccurate predictions for those groups.

Fairness: There should be continuous monitoring for fairness, especially for sensitive groups like race, gender, and nationality, as the model's predictions could inadvertently reinforce societal biases.

Transparency: The model does not expose the reasoning behind individual predictions, but interpretability tools such as SHAP or LIME can be used to understand which features influence its predictions the most.

Privacy: The model uses demographic data but does not process sensitive personal information like financial records or healthcare data, minimizing privacy concerns.
## Caveats and Recommendations
Caveats:

The model may underperform if exposed to new demographic data that differs significantly from the training data.

It is sensitive to the feature distributions seen in the training dataset, and might not generalize well to data with different characteristics.

The model should not be used in decision-making processes with high social or economic consequences without human oversight.

Recommendations:

Retrain the model periodically with fresh data to ensure its predictions remain relevant.

Evaluate the model regularly for fairness and ensure its predictions are not disproportionately biased against certain groups.

Use the model as one part of a broader decision-making process, always involving human oversight in sensitive areas like hiring, loan approvals, or healthcare.
Binary file added screenshots/continuous_integration.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/local_api.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/unit_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading