Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions notebooks/1.0_DataLoading.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -216,19 +216,21 @@
"[5 rows x 37 columns]"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('../data/Meter_A.txt', sep='\\t', header=None).dropna()\n",
"data.head()"
"data.head()\n",
"\n",
"#Reading data from csv file"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -246,36 +248,40 @@
" [ 0.79672987, 1.01057037, 0.99902897, ..., 33.81673167,\n",
" 33.01106667, 2. ],\n",
" [ 0.79019427, 1.00419541, 0.99553749, ..., 33.66862167,\n",
" 33.11848833, 2. ]])"
" 33.11848833, 2. ]], shape=(87, 37))"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.to_numpy()\n",
"\n",
"data"
"data\n",
"\n",
"#turning data into a numpy array"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_data(path):\n",
" data = pd.read_csv(path, sep='\\t', header=None).dropna()\n",
" print(data.head())\n",
" data = data.to_numpy()\n",
" return data"
" return data\n",
"\n",
"#loads file into a tab-separated file into a pandas DataFrame, then removes any row with one missing value but it doesn't seem to have removed any "
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -321,7 +327,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "practice1",
"display_name": "myenv",
"language": "python",
"name": "python3"
},
Expand All @@ -335,7 +341,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.14"
}
},
"nbformat": 4,
Expand Down
8 changes: 4 additions & 4 deletions notebooks/3.0_Model_built.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.data_preprocess import DataPreprocessing"
"from meter_Hw4.src.data_preprocess_split import DataPreprocessing"
]
},
{
Expand Down Expand Up @@ -658,7 +658,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "practice1",
"display_name": "myenv",
"language": "python",
"name": "python3"
},
Expand All @@ -672,7 +672,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.14"
}
},
"nbformat": 4,
Expand Down
485 changes: 485 additions & 0 deletions notebooks/MLPClassifier Further Analysis.ipynb

Large diffs are not rendered by default.

474 changes: 474 additions & 0 deletions notebooks/Test.ipynb

Large diffs are not rendered by default.

83 changes: 77 additions & 6 deletions src/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,86 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


class DataPreprocessing:
    """Load the tab-separated meter data and prepare train/validation/test sets."""

    def __init__(self):
        pass

    def load_data(self, path):
        """Read the file at ``path``, split it, and standardize the features.

        The file is parsed as tab-separated text without a header row, and
        rows containing any missing value are dropped. The last column is
        used as the class label; every other column is a feature.

        Splitting happens in two steps: 20% of the rows are held out as the
        test set, then 20% of the remainder becomes the validation set
        (roughly 64% / 16% / 20% train / validation / test). Both splits use
        fixed seeds, so repeated calls on the same file give the same split.

        Args:
            path: Location of the tab-separated data file.

        Returns:
            dict with the keys ``X_train``, ``y_train``, ``X_val``, ``y_val``,
            ``X_test``, ``y_test`` (unscaled arrays), ``X_train_scaled``,
            ``X_val_scaled``, ``X_test_scaled`` (standardized features),
            ``scaler`` (the fitted ``StandardScaler``) and ``raw_data`` (the
            full array before splitting).
        """
        # Read once; preview while it is still a DataFrame, because the
        # NumPy array produced below has no ``head()``.
        df = pd.read_csv(path, sep="\t", header=None).dropna()
        print(df.head())

        data = df.to_numpy()

        # 1. Train / validation / test split (fixed seeds for repeatability).
        train_validation, test = train_test_split(
            data, test_size=0.2, random_state=12
        )
        train, validation = train_test_split(
            train_validation, test_size=0.2, random_state=99
        )

        # 2. Show which class labels ended up in each split, so a split that
        #    is missing a class is easy to spot.
        print("Train:", set(train[:, -1]))
        print("Validation:", set(validation[:, -1]))
        print("Test:", set(test[:, -1]))

        # 3. Separate features (all but the last column) from labels (last).
        X_train, y_train = train[:, :-1], train[:, -1]
        X_val, y_val = validation[:, :-1], validation[:, -1]
        X_test, y_test = test[:, :-1], test[:, -1]

        # 4. Fit the scaler on the training features only, so no statistics
        #    from the validation or test rows leak into training.
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        # 5. Hand everything back under descriptive keys.
        return {
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "X_train_scaled": X_train_scaled,
            "X_val_scaled": X_val_scaled,
            "X_test_scaled": X_test_scaled,
            "scaler": scaler,
            "raw_data": data,
        }

'''
***How to Use:

from data_preprocess import DataPreprocessing

pre = DataPreprocessing()
data_dict = pre.load_data("data/Meter_A.txt")

X_train = data_dict["X_train"]
y_train = data_dict["y_train"]

X_train_scaled = data_dict["X_train_scaled"]
'''
46 changes: 46 additions & 0 deletions src/mlpclassifier_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

class MLPClassificationModel:
    """Wrapper around scikit-learn's ``MLPClassifier`` with fixed SGD settings.

    The caller chooses the network shape, initial learning rate and iteration
    budget; the solver, momentum, learning-rate schedule and random seed are
    fixed here.
    """

    def __init__(self, hidden_layer_sizes, learning_rate_init, max_iter):
        """Create the underlying classifier (it is not trained yet).

        Args:
            hidden_layer_sizes: Passed through to ``MLPClassifier``.
            learning_rate_init: Passed through to ``MLPClassifier``.
            max_iter: Passed through to ``MLPClassifier``.
        """
        caller_settings = {
            "hidden_layer_sizes": hidden_layer_sizes,
            "learning_rate_init": learning_rate_init,
            "max_iter": max_iter,
        }
        # "sgd" is used because "adam" gave the same answers on every
        # iteration.
        fixed_settings = {
            "solver": "sgd",
            "momentum": 0.9,
            "learning_rate": "adaptive",
            "random_state": 42,
        }
        self.model = MLPClassifier(**caller_settings, **fixed_settings)

    def train(self, X_train, y_train):
        """Fit the classifier on the given features and labels."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the predicted class for each row of ``X_test``."""
        return self.model.predict(X_test)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test``, print accuracy and a report, return predictions.

        Args:
            X_test: Features to predict on.
            y_test: True labels that the predictions are compared against.

        Returns:
            The predictions for ``X_test``.
        """
        predictions = self.predict(X_test)

        print("Accuracy:", accuracy_score(y_test, predictions))

        print("\nClassification Report:")
        print(classification_report(y_test, predictions))

        return predictions

'''
***How to use:

clf = MLPClassificationModel(
hidden_layer_sizes=(64, 32),
learning_rate_init=0.001,
max_iter=300
)

clf.train(X_train_scaled, y_train)
y_pred = clf.evaluate(X_test_scaled, y_test)
'''
56 changes: 56 additions & 0 deletions src/mlpclassifier_visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

class ConfusionMatrixVisualizer:
    """Plot how a classifier's predictions compare with the true labels.

    Attributes:
        y_test: True labels, as passed in.
        y_pred: Predicted labels, as passed in.
        labels: Sorted class labels found in ``y_test`` and ``y_pred``.
        cm: Confusion matrix whose rows and columns follow ``labels``.
    """

    def __init__(self, y_test, y_pred):
        """Store the labels and compute the confusion matrix.

        Args:
            y_test: True class labels.
            y_pred: Predicted class labels, aligned with ``y_test``.
        """
        self.y_test = y_test
        self.y_pred = y_pred
        # This is the label set and ordering confusion_matrix falls back to
        # when ``labels`` is omitted, so the matrix itself is unchanged; it is
        # kept so the heatmap ticks can be named after the real classes.
        self.labels = unique_labels(y_test, y_pred)
        self.cm = confusion_matrix(y_test, y_pred, labels=self.labels)

    @staticmethod
    def _label_text(label):
        """Return tick text for one class label (``1.0`` is shown as ``1``)."""
        # Labels taken from a float array arrive as 1.0, 2.0, ...; drop the
        # trailing ".0" so the ticks read "Predicted 1", "Predicted 2".
        if isinstance(label, float) and label.is_integer():
            return str(int(label))
        return str(label)

    def plot_confusion_matrix(self):
        """Show the confusion matrix as an annotated heatmap.

        Tick labels are built from the classes present in the data, so the
        plot is labelled correctly for any number of classes rather than
        only for two.
        """
        tick_text = [self._label_text(label) for label in self.labels]

        plt.figure(figsize=(6, 5))
        sns.heatmap(
            self.cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=[f"Predicted {text}" for text in tick_text],
            yticklabels=[f"Actual {text}" for text in tick_text],
        )
        plt.title("Confusion Matrix")
        plt.ylabel("True Label")
        plt.xlabel("Predicted Label")
        plt.tight_layout()
        plt.show()

    def plot_performance_scatter(self):
        """Show a scatter plot of actual class (x) against predicted class (y)."""
        plt.figure(figsize=(8, 6))
        plt.scatter(self.y_test, self.y_pred, alpha=0.6)
        plt.xlabel("Actual Class")
        plt.ylabel("Predicted Class")
        plt.title("MLPClassifier Performance")
        plt.grid(True)
        plt.show()

'''
***How to use:

# Create model with your chosen hyperparameters
clf = MLPClassificationModel(
hidden_layer_sizes=(64, 32),
learning_rate_init=0.001,
max_iter=300
)

# Train
clf.train(X_train_scaled, y_train)

# Evaluate
y_pred = clf.evaluate(X_test_scaled, y_test)

# Visualize
viz = ConfusionMatrixVisualizer(y_test, y_pred)
viz.plot_confusion_matrix()
viz.plot_performance_scatter()
'''
2 changes: 1 addition & 1 deletion src/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sklearn.metrics import accuracy_score

# Importing the parent: DataPreprocessing class from data_preprocess.py
from src.data_preprocess import DataPreprocessing
from meter_Hw4.src.data_preprocess import DataPreprocessing


class ModelBuilder(DataPreprocessing):
Expand Down
Loading