udacity · byrdstephenr · Feb 24, 2025 · Mar 1, 2025
@@ -1,31 +1,42 @@
-name: Python CI
+import pickle
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import fbeta_score, precision_score, recall_score
+from ml.data import process_data
 
-on: [push]
+def train_model(X_train, y_train):  # Fit a RandomForestClassifier on (X_train, y_train) and return the fitted model.
+    model = RandomForestClassifier(random_state=42)  # fixed seed so repeated training runs are reproducible
+    model.fit(X_train, y_train)
+    return model
 
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10"] 
+def compute_model_metrics(y, preds):  # Score predictions `preds` against labels `y`; returns (precision, recall, fbeta).
+    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)  # beta=1 weights precision and recall equally (F1)
+    precision = precision_score(y, preds, zero_division=1)  # zero_division=1: report 1.0 when the metric is undefined (zero denominator)
+    recall = recall_score(y, preds, zero_division=1)
+    return precision, recall, fbeta
 
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest test_ml.py -v
+def inference(model, X):  # Return the model's predictions for the inputs X.
+    return model.predict(X)
+
+def save_model(model, path):  # Serialize `model` to the file at `path` using pickle.
+    with open(path, 'wb') as f:  # binary mode: pickle writes bytes
+        pickle.dump(model, f)
+
+def load_model(path):  # Load and return the pickled object stored at `path`.
+    with open(path, 'rb') as f:
+        return pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
+
+def performance_on_categorical_slice(  # Compute (precision, recall, fbeta) on the rows where `column_name` equals `slice_value`.
+    data, column_name, slice_value, categorical_features, label, encoder, lb, model
+):
+    slice_data = data[data[column_name] == slice_value]  # boolean-mask filter; presumably `data` is a pandas DataFrame (confirm). NOTE(review): an empty slice is not handled here
+    X_slice, y_slice, _, _ = process_data(
+        slice_data,
+        categorical_features=categorical_features,
+        label=label,
+        training=False,  # presumably makes process_data reuse the fitted encoder/lb passed below instead of refitting; confirm in ml.data
+        encoder=encoder,
+        lb=lb
+    )
+    preds = inference(model, X_slice)
+    precision, recall, fbeta = compute_model_metrics(y_slice, preds)
+    return precision, recall, fbeta
@@ -0,0 +1,27 @@
+name: Python CI
+
+on: [push]  # run the workflow on every push
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"  # keep the quotes: an unquoted 3.10 is parsed by YAML as the number 3.1
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest test_ml.py -v
@@ -1,14 +1,14 @@
 import json
-
 import requests
 
-# TODO: send a GET using the URL http://127.0.0.1:8000
-r = None # Your code here
+# DONE: send a GET using the URL http://127.0.0.1:8000
+local_URL = "http://127.0.0.1:8000/"
+r = requests.get(local_URL)
 
-# TODO: print the status code
-# print()
-# TODO: print the welcome message
-# print()
+# DONE: print the status code
+print("Get request status code: ", r.status_code)
+# DONE: print the welcome message (the JSON body returned by the root endpoint)
+print("Welcome Message:", r.json())
 
 
 
@@ -26,13 +26,17 @@
     "capital-gain": 0,
     "capital-loss": 0,
     "hours-per-week": 40,
-    "native-country": "United-States",
+    "native-country": "United-States"
 }
 
-# TODO: send a POST using the data above
-r = None # Your code here
 
-# TODO: print the status code
-# print()
-# TODO: print the result
-# print()
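+# DONE: send a POST using the data above; json= serializes the dict as a JSON request body (the commented-out data= attempt below would send it form-encoded)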
+#r = requests.post(local_URL+"data?",
+#                         data = data)
+r2 = requests.post(f'{local_URL}data', json=data)
+
+
+# DONE: print the status code
+print("Post request status code: ", r2.status_code)
+# DONE: print the result
+print("Inference result: ", r2.json())
@@ -7,6 +7,7 @@
 from ml.data import apply_label, process_data
 from ml.model import inference, load_model
 
+
 # DO NOT MODIFY
 class Data(BaseModel):
     age: int = Field(..., example=37)
@@ -15,7 +16,9 @@ class Data(BaseModel):
     education: str = Field(..., example="HS-grad")
     education_num: int = Field(..., example=10, alias="education-num")
     marital_status: str = Field(
-        ..., example="Married-civ-spouse", alias="marital-status"
+        ...,
+        example="Married-civ-spouse",
+        alias="marital-status",
     )
     occupation: str = Field(..., example="Prof-specialty")
     relationship: str = Field(..., example="Husband")
@@ -24,26 +27,31 @@ class Data(BaseModel):
     capital_gain: int = Field(..., example=0, alias="capital-gain")
     capital_loss: int = Field(..., example=0, alias="capital-loss")
     hours_per_week: int = Field(..., example=40, alias="hours-per-week")
-    native_country: str = Field(..., example="United-States", alias="native-country")
+    native_country: str = Field(
+        ...,
+        example="United-States",
+        alias="native-country",
+    )
+
+
+project_path = "/mnt/c/Users/kaleb/Desktop/DEPLOYING-A-SCALABLE-ML-PIPELINE-WITH-FASTAPI"  # NOTE(review): machine-specific absolute path; this fails on any other host (including CI). Consider deriving it from __file__
 
-path = None # TODO: enter the path for the saved encoder 
-encoder = load_model(path)
+encoder_path = os.path.join(project_path, "model", "encoder.pkl")  # NOTE(review): relies on `import os`, which is not visible in this hunk; confirm it is imported at the top of the file
+encoder = load_model(encoder_path)  # loaded once at module import, before the app object is created
 
-path = None # TODO: enter the path for the saved model 
-model = load_model(path)
+model_path = os.path.join(project_path, "model", "model.pkl")
+model = load_model(model_path)  # loaded once at module import; post_inference passes this object to inference()
+
+
+app = FastAPI()
 
-# TODO: create a RESTful API using FastAPI
-app = None # your code here
 
-# TODO: create a GET on the root giving a welcome message
 @app.get("/")
 async def get_root():
     """ Say hello!"""
-    # your code here
-    pass
+    return {"message": "Hello! Welcome to Stephen's API."}  # the returned dict is sent back as the JSON response body
 
 
-# TODO: create a POST on a different path that does model inference
 @app.post("/data/")
 async def post_inference(data: Data):
     # DO NOT MODIFY: turn the Pydantic model into a dict.
@@ -64,11 +72,15 @@ async def post_inference(data: Data):
         "sex",
         "native-country",
     ]
+
     data_processed, _, _, _ = process_data(
-        # your code here
-        # use data as data input
-        # use training = False
-        # do not need to pass lb as input
+        data,
+        categorical_features=cat_features,
+        training=False,
+        encoder=encoder,
     )
-    _inference = None # your code here to predict the result using data_processed
-    return {"result": apply_label(_inference)}
+
+    _inference = inference(model, data_processed)
+    return {
+        "result": apply_label(_inference)
+    }
@@ -1,7 +1,8 @@
 import pickle
 from sklearn.metrics import fbeta_score, precision_score, recall_score
 from ml.data import process_data
-# TODO: add necessary import
+from sklearn.ensemble import RandomForestClassifier  # or your chosen model
+import numpy as np
 
 # Optional: implement hyperparameter tuning.
 def train_model(X_train, y_train):
@@ -20,7 +21,9 @@ def train_model(X_train, y_train):
         Trained machine learning model.
     """
     # TODO: implement the function
-    pass
+    model = RandomForestClassifier(random_state=42)
+    model.fit(X_train, y_train)
+    return model
 
 
 def compute_model_metrics(y, preds):
@@ -60,7 +63,7 @@ def inference(model, X):
         Predictions from the model.
     """
     # TODO: implement the function
-    pass
+    return model.predict(X)
 
 def save_model(model, path):
     """ Serializes model to a file.
@@ -73,12 +76,14 @@ def save_model(model, path):
         Path to save pickle file.
     """
     # TODO: implement the function
-    pass
+    with open(path, 'wb') as f:
+        pickle.dump(model, f)
 
 def load_model(path):
     """ Loads pickle file from `path` and returns it."""
     # TODO: implement the function
-    pass
+    with open(path, 'rb') as f:
+        return pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
 
 
 def performance_on_categorical_slice(
@@ -117,12 +122,15 @@ def performance_on_categorical_slice(
     fbeta : float
 
     """
-    # TODO: implement the function
+    slice_data = data[data[column_name] == slice_value]
     X_slice, y_slice, _, _ = process_data(
-        # your code here
-        # for input data, use data in column given as "column_name", with the slice_value 
-        # use training = False
+        slice_data,
+        categorical_features=categorical_features,
+        label=label,
+        training=False,
+        encoder=encoder,
+        lb=lb
     )
-    preds = None # your code here to get prediction on X_slice using the inference function
+    preds = inference(model, X_slice)
     precision, recall, fbeta = compute_model_metrics(y_slice, preds)
     return precision, recall, fbeta
@@ -0,0 +1,55 @@
+# Model Card
+
+For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf
+
+## Model Details
+
+- Developed by: Stephen Byrd, February 2025
+- Model Type: This model uses a **Random Forest Classifier** for binary classification.
+- Dataset: The model is trained on the **Adult Census Dataset** from the UCI Machine Learning Repository.
+
+## Intended Use
+
+- The model is intended to predict whether an individual earns more than $50,000 per year based on demographic features from the census data.
+
+## Training Data
+
+- Dataset Source: The data is extracted from the 1994 Census database.
+- Features:
+  - age
+  - workclass
+  - education
+  - education-num
+  - marital-status
+  - occupation
+  - relationship
+  - race
+  - sex
+  - capital-gain
+  - capital-loss
+  - hours-per-week
+  - native-country
+- Target Label: 'salary' with values '>50K' (1) and '<=50K' (0).
+
+## Evaluation Data
+
+- Validation Method: The model is evaluated using a test dataset split from the original data.
+- Metrics Used: Precision, Recall, F1 Score.
+
+## Metrics
+
+- Metrics Used: Precision, Recall, F1 Score.
+- Model Performance:
+  - Precision: **0.7419**
+  - Recall: **0.6384**
+  - F1 Score: **0.6863**
+
+## Ethical Considerations
+
+- The dataset is demographically imbalanced (e.g., more men than women, predominantly white individuals), so the model may be biased toward the over-represented groups.
+- The model poses no direct risk to human life.
+
+## Caveats and Recommendations
+
+- Limitations: The model's performance could be improved with further tuning or using different classifiers.
+- Future Work: Consider using techniques like SHAP for feature importance analysis or exploring other classification models.