gabi107 · ldiaz225 · Mar 6, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/notebooks/1.0_DataLoading.ipynb b/notebooks/1.0_DataLoading.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -216,19 +216,21 @@
        "[5 rows x 37 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "data = pd.read_csv('../data/Meter_A.txt', sep='\\t', header=None).dropna()\n",
-    "data.head()"
+    "data.head()\n",
+    "\n",
+    "#Reading data from the tab-separated text file"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -246,36 +248,40 @@
        "       [ 0.79672987,  1.01057037,  0.99902897, ..., 33.81673167,\n",
        "        33.01106667,  2.        ],\n",
        "       [ 0.79019427,  1.00419541,  0.99553749, ..., 33.66862167,\n",
-       "        33.11848833,  2.        ]])"
+       "        33.11848833,  2.        ]], shape=(87, 37))"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "data = data.to_numpy()\n",
     "\n",
-    "data"
+    "data\n",
+    "\n",
+    "#turning data into a numpy array"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def load_data(path):\n",
     "    data = pd.read_csv(path, sep='\\t', header=None).dropna()\n",
     "    print(data.head())\n",
     "    data = data.to_numpy()\n",
-    "    return data"
+    "    return data\n",
+    "\n",
+    "#loads a tab-separated file into a pandas DataFrame, then drops any row with at least one missing value (no rows appear to have been removed)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -321,7 +327,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "practice1",
+   "display_name": "myenv",
    "language": "python",
    "name": "python3"
   },
@@ -335,7 +341,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.14"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/3.0_Model_built.ipynb b/notebooks/3.0_Model_built.ipynb
@@ -27,11 +27,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from src.data_preprocess import DataPreprocessing"
+    "from meter_Hw4.src.data_preprocess_split import DataPreprocessing"
    ]
   },
   {
@@ -658,7 +658,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "practice1",
+   "display_name": "myenv",
    "language": "python",
    "name": "python3"
   },
@@ -672,7 +672,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.14"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/MLPClassifier Further Analysis.ipynb b/notebooks/MLPClassifier Further Analysis.ipynb
diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
diff --git a/src/data_preprocess.py b/src/data_preprocess.py
@@ -1,15 +1,86 @@
 import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
 
-
-class DataPreprocessing():
+class DataPreprocessing:
     def __init__(self):
         pass
 
     def load_data(self, path):
-        data = pd.read_csv(path, sep="\t", header=None).dropna()
+        # Load raw data
+        df = pd.read_csv(path, sep="\t", header=None).dropna()
+        print(df.head())  # <-- print BEFORE converting to numpy
+
+        # Convert to numpy
+        data = df.to_numpy()
+
+        # ---------------------------------------------------------
+        # 1. TRAIN / VALIDATION / TEST SPLIT
+        # ---------------------------------------------------------
+        train_validation, test = train_test_split(
+            data, test_size=0.2, random_state=12
+        )
+
+        train, validation = train_test_split(
+            train_validation, test_size=0.2, random_state=99
+        )
+
+        # ---------------------------------------------------------
+        # 2. CHECK WHICH CLASS LABELS APPEAR IN EACH SPLIT
+        # ---------------------------------------------------------
+        print("Train:", set(train[:, -1]))
+        print("Validation:", set(validation[:, -1]))
+        print("Test:", set(test[:, -1]))
+
+        # ---------------------------------------------------------
+        # 3. SEPARATE FEATURES AND LABELS
+        # ---------------------------------------------------------
+        X_train = train[:, :-1]
+        y_train = train[:, -1]
+
+        X_val = validation[:, :-1]
+        y_val = validation[:, -1]
+
+        X_test = test[:, :-1]
+        y_test = test[:, -1]
+
+        # ---------------------------------------------------------
+        # 4. SCALE USING ONLY TRAINING DATA
+        # ---------------------------------------------------------
+        scaler = StandardScaler().fit(X_train)
+
+        X_train_scaled = scaler.transform(X_train)
+        X_val_scaled   = scaler.transform(X_val)
+        X_test_scaled  = scaler.transform(X_test)
+
+        # ---------------------------------------------------------
+        # 5. RETURN EVERYTHING CLEANLY
+        # ---------------------------------------------------------
+        return {
+            "X_train": X_train,
+            "y_train": y_train,
+            "X_val": X_val,
+            "y_val": y_val,
+            "X_test": X_test,
+            "y_test": y_test,
+            "X_train_scaled": X_train_scaled,
+            "X_val_scaled": X_val_scaled,
+            "X_test_scaled": X_test_scaled,
+            "scaler": scaler,
+            "raw_data": data
+        }
+
+'''
+***How to Use: 
+
+from meter_Hw4.src.data_preprocess import DataPreprocessing
 
-        print(data.head())
+pre = DataPreprocessing()
+data_dict = pre.load_data("data/Meter_A.txt")
 
-        data = data.to_numpy()
+X_train = data_dict["X_train"]
+y_train = data_dict["y_train"]
 
-        return data
+X_train_scaled = data_dict["X_train_scaled"]
+'''
diff --git a/src/mlpclassifier_test.py b/src/mlpclassifier_test.py
@@ -0,0 +1,46 @@
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import accuracy_score, classification_report
+
+class MLPClassificationModel:
+    def __init__(self, hidden_layer_sizes, learning_rate_init, max_iter):
+        self.model = MLPClassifier(
+            hidden_layer_sizes=hidden_layer_sizes,
+            learning_rate_init=learning_rate_init,
+            max_iter=max_iter,
+            solver="sgd",              # <--- now using "sgd" because "adam" gave the same answers on every iteration
+            momentum=0.9,              
+            learning_rate="adaptive",  
+            random_state=42
+        )
+
+    def train(self, X_train, y_train):
+        self.model.fit(X_train, y_train)
+
+
+    def predict(self, X_test):
+        return self.model.predict(X_test)
+
+
+    def evaluate(self, X_test, y_test):
+        y_pred = self.predict(X_test)
+
+        accuracy = accuracy_score(y_test, y_pred)
+        print("Accuracy:", accuracy)
+
+        print("\nClassification Report:")
+        print(classification_report(y_test, y_pred))
+
+        return y_pred
+
+'''
+***How to use:
+
+clf = MLPClassificationModel(
+    hidden_layer_sizes=(64, 32),
+    learning_rate_init=0.001,
+    max_iter=300
+)
+
+clf.train(X_train_scaled, y_train)
+y_pred = clf.evaluate(X_test_scaled, y_test)
+'''
diff --git a/src/mlpclassifier_visualization.py b/src/mlpclassifier_visualization.py
@@ -0,0 +1,56 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix
+
+class ConfusionMatrixVisualizer:
+    def __init__(self, y_test, y_pred):
+        self.y_test = y_test
+        self.y_pred = y_pred
+        self.cm = confusion_matrix(y_test, y_pred)
+
+    def plot_confusion_matrix(self):
+        plt.figure(figsize=(6, 5))
+        sns.heatmap(
+            self.cm,
+            annot=True,
+            fmt="d",
+            cmap="Blues",
+            xticklabels=["Predicted 1", "Predicted 2"],
+            yticklabels=["Actual 1", "Actual 2"]
+        )
+        plt.title("Confusion Matrix")
+        plt.ylabel("True Label")
+        plt.xlabel("Predicted Label")
+        plt.tight_layout()
+        plt.show()
+
+    def plot_performance_scatter(self):
+        plt.figure(figsize=(8, 6))
+        plt.scatter(self.y_test, self.y_pred, alpha=0.6)
+        plt.xlabel("Actual Class")
+        plt.ylabel("Predicted Class")
+        plt.title("MLPClassifier Performance")
+        plt.grid(True)
+        plt.show()
+
+'''
+***How to use:
+
+# Create model with your chosen hyperparameters
+clf = MLPClassificationModel(
+    hidden_layer_sizes=(64, 32),
+    learning_rate_init=0.001,
+    max_iter=300
+)
+
+# Train
+clf.train(X_train_scaled, y_train)
+
+# Evaluate
+y_pred = clf.evaluate(X_test_scaled, y_test)
+
+# Visualize
+viz = ConfusionMatrixVisualizer(y_test, y_pred)
+viz.plot_confusion_matrix()
+viz.plot_performance_scatter()
+'''
diff --git a/src/model_builder.py b/src/model_builder.py
@@ -3,7 +3,7 @@
 from sklearn.metrics import accuracy_score
 
 # Importing the parent: DataPreprocessing class from data_preprocess.py
-from src.data_preprocess import DataPreprocessing 
+from meter_Hw4.src.data_preprocess import DataPreprocessing 
 
 
 class ModelBuilder(DataPreprocessing):