Feature/final exam #40

Open: wants to merge 5 commits into main
25 changes: 10 additions & 15 deletions Dockerfile
@@ -1,18 +1,13 @@
-FROM python:2-alpine
+from sklearn.model_selection import train_test_split
 
-COPY ./requirements.txt /app/requirements.txt
+def split_train_dev_test(X, y, test_size, dev_size):
+    # Split into train+dev and test sets first
+    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size)
 
-WORKDIR /app
+    # Compute actual dev size relative to the combined train+dev set
+    actual_dev_size = dev_size / (1 - test_size)
 
-RUN apk --update add python py-pip openssl ca-certificates py-openssl wget bash linux-headers
-RUN apk --update add --virtual build-dependencies libffi-dev openssl-dev python-dev py-pip build-base \
-    && pip install --upgrade pip \
-    && pip install --upgrade pipenv \
-    && pip install --upgrade -r /app/requirements.txt \
-    && apk del build-dependencies
-
-COPY . /app
-
-ENTRYPOINT [ "python" ]
-
-CMD [ "hello.py" ]
+    # Split the train+dev set into separate training and dev sets
+    X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=actual_dev_size)
+
+    return X_train, X_dev, X_test, y_train, y_dev, y_test
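A quick sanity check of the dev-size arithmetic above (illustrative, not part of the PR): with test_size=0.2 and dev_size=0.1, the second split must take 0.1 / (1 - 0.2) = 0.125 of the remaining 80% to yield a 70/10/20 split.

import numpy as np
X = np.arange(1000).reshape(-1, 1)
y = np.arange(1000)
X_train, X_dev, X_test, y_train, y_dev, y_test = split_train_dev_test(X, y, 0.2, 0.1)
print(len(X_train), len(X_dev), len(X_test))  # expected: 700 100 200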
1 change: 1 addition & 0 deletions README.md
@@ -1,5 +1,6 @@
 # Getting Started with Python on IBM Cloud
+
 
 To get started, we'll take you through a sample Python Flask app, help you set up a development environment, deploy to IBM Cloud and add a Cloudant database.
 
 The following instructions are for deploying the application as a Cloud Foundry application. To deploy as a container to **IBM Cloud Kubernetes Service** instead, [see README-kubernetes.md](README-kubernetes.md)
20 changes: 20 additions & 0 deletions ans2.py
@@ -0,0 +1,20 @@
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from joblib import dump

# Load and flatten the digits dataset. NOTE: X_train/y_train/X_test/y_test and
# roll_no were undefined in the original snippet; this setup is an assumption
# added to make the script runnable.
digits = datasets.load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

roll_no = "m22aie215"  # roll number used in model filenames elsewhere in this PR

# Compare logistic-regression solvers and save one model per solver
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
for solver in solvers:
    model = LogisticRegression(solver=solver, max_iter=1000)  # higher max_iter so sag/saga converge
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(f"Performance with solver {solver}:\n{report}")

    # Save the model
    model_filename = f"{roll_no}_lr_{solver}.joblib"
    dump(model, model_filename)
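
A minimal sketch (not part of the PR) of reloading one of the dumped models; the filename follows the pattern used above:

from joblib import load
reloaded = load(f"{roll_no}_lr_lbfgs.joblib")
print(reloaded.score(X_test, y_test))  # accuracy of the reloaded model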


50 changes: 50 additions & 0 deletions app.py
 if file:
     image = _read_image(Image.open(file))
-    model_path = "models/best_model_C-1_gamma-0.001.joblib"
+    model_path = "./model/best_model_C-1_gamma-10.joblib"
     model = joblib.load(model_path)
     prediction = model.predict(image)
     return jsonify({"prediction": str(prediction[0])})
 else:
     return jsonify({"error": "Invalid file format"})
 
 
-@app.route("/prediction", methods=["POST"])
-def prediction():
+@app.route("/prediction/<model_type>", methods=["POST"])
+def prediction(model_type):
+    if model_type not in ["svm", "tree", "lr"]:
+        return jsonify({"error": "Invalid model type"})
+    else:
+        model = load_model(model_type)
     data_json = request.json
     if data_json:
         data_dict = json.loads(data_json)
         image = np.array([data_dict["image"]])
-        model_path = "models/best_model_C-1_gamma-0.001.joblib"
-        model = joblib.load(model_path)
+        # model_path = "models/best_model_C-1_gamma-0.001.joblib"
+        # model = joblib.load(model_path)
         try:
             prediction = model.predict(image)
             return jsonify({"prediction": str(prediction[0])})
@@ -123,7 +127,18 @@ def prediction():
             return jsonify({"error": "Invalid data format"})
 
 
+def load_model(model_type="svm"):
+    if model_type == "svm":
+        model_path = "./models/best_model_C-1_gamma-10.joblib"
+    elif model_type == "tree":
+        model_path = "./models/best_model_max_depth-15.joblib"
+    elif model_type == "lr":
+        model_path = "./models/best_model_solver-lbfgs.joblib"
+    model = joblib.load(model_path)
+    return model
 
 
 if __name__ == "__main__":
     print("server is running")
-    #check
+    # check
     app.run(host="0.0.0.0", port=8000)
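
For reference, a client-side sketch of calling the new endpoint (an illustration mirroring how test_utils.py posts to it; assumes the server from this PR is running on localhost:8000 and that the requests package is installed):

import json
import requests

image_dict = {"image": image_array[0].tolist()}  # a preprocessed 1x64 digit image, assumed defined
resp = requests.post("http://localhost:8000/prediction/svm", json=json.dumps(image_dict))
print(resp.json())  # e.g. {"prediction": "3"}

Note that the handler calls json.loads(request.json), so it expects a JSON-encoded string rather than a plain JSON object, hence the json.dumps here.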
15 changes: 15 additions & 0 deletions hello.py
@@ -86,3 +86,18 @@ def shutdown():

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=port, debug=True)


# NOTE: train_data, test_data and dev_data are not defined in this file; the
# lines below assume they are arrays of images created earlier. They also sit
# after the blocking app.run() call, so they only execute once the server exits.

# Calculate the total number of samples
total_samples = len(train_data) + len(test_data) + len(dev_data)

# Get the image dimensions (assuming all images have the same size)
image_height, image_width = train_data[0].shape[:2]  # shape is (height, width[, channels])

# Print statements
print(f"Total number of samples in the dataset: {total_samples}")
print(f"Size of the images in the dataset: {image_height}x{image_width}")
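
For context, one way the three arrays above could be produced (an assumption; the PR never defines them in this file) is via the helpers added in utils.py:

from utils import read_digits, split_train_dev_test
X, y = read_digits()
train_data, dev_data, test_data, _, _, _ = split_train_dev_test(X, y, test_size=0.2, dev_size=0.1)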

19 changes: 19 additions & 0 deletions image_resize.py
@@ -0,0 +1,19 @@
import numpy as np
from skimage.transform import resize

def resize_images(images, size):
    """Resizes a batch of images to the given size.

    Args:
        images: A numpy array of shape (n, height, width) or (n, height, width, channels).
        size: The target (height, width) of the images.

    Returns:
        A numpy array of resized images.
    """

    # Preserve any trailing channel axis; indexing images.shape[3] as the
    # original did fails for grayscale (n, height, width) batches.
    resized_images = np.zeros((images.shape[0], size[0], size[1]) + images.shape[3:])
    for i in range(images.shape[0]):
        resized_images[i] = resize(images[i], size, order=3)  # order=3: bicubic
    return resized_images
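
A short usage sketch (illustrative; uses the same sklearn digits images as the rest of this PR):

from sklearn import datasets
digits = datasets.load_digits()
resized = resize_images(digits.images, (16, 16))
print(digits.images.shape, "->", resized.shape)  # (1797, 8, 8) -> (1797, 16, 16)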

139 changes: 139 additions & 0 deletions test_utils.py
@@ -0,0 +1,139 @@
import utils
from joblib import load
import os
import json
from app import app
from sklearn import datasets
import numpy as np


def create_dummy_dataset():
    X, y = utils.read_digits()
    X_train = X[:100, :, :]
    y_train = y[:100]
    X_dev = X[:50, :, :]
    y_dev = y[:50]
    X_train = utils.preprocess_data(X_train)
    X_dev = utils.preprocess_data(X_dev)
    return X_train, y_train, X_dev, y_dev


def create_dummy_hparams():
    return {
        "gamma": [0.001, 0.01, 0.1, 1, 10, 100],
        "C": [0.1, 1, 2, 5, 10],
    }


def create_dummy_lr_hparams():
    return {
        "solver": ["newton-cg"],
    }


def test_hparams_combinations():
    # a test case to check all possible combinations of hyperparameters
    h_params_grid = create_dummy_hparams()
    h_param_combinations = utils.get_combinations_with_keys(h_params_grid)

    assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len(
        h_params_grid["C"]
    )


def test_hparams_combinations_values():
    # a test case to check all possible combinations of hyperparameter values
    h_params_grid = create_dummy_hparams()
    h_param_combinations = utils.get_combinations_with_keys(h_params_grid)

    assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len(
        h_params_grid["C"]
    )
    expected_param_combo_1 = {"gamma": 0.001, "C": 1}
    expected_param_combo_2 = {"gamma": 0.01, "C": 1}

    assert expected_param_combo_1 in h_param_combinations
    assert expected_param_combo_2 in h_param_combinations


def test_data_splitting():
    X, y = utils.read_digits()
    X = X[:100, :, :]
    y = y[:100]
    test_size = 0.1
    dev_size = 0.6
    train_size = 1 - (test_size + dev_size)
    print(train_size)
    (
        X_train,
        X_dev,
        X_test,
        y_train,
        y_dev,
        y_test,
    ) = utils.split_train_dev_test(X, y, test_size=test_size, dev_size=dev_size)
    print(f"{len(X_train)},{len(X_dev)},{len(X_test)}")
    assert len(X_train) + len(X_dev) + len(X_test) == 100
    assert len(y_train) + len(y_dev) + len(y_test) == 100
    assert 29 <= len(X_train) <= 31
    assert 29 <= len(y_train) <= 31
    assert 59 <= len(X_dev) <= 61
    assert 59 <= len(y_dev) <= 61
    assert 9 <= len(X_test) <= 11
    assert 9 <= len(y_test) <= 11


def test_is_model_saved():
    X_train, y_train, X_dev, y_dev = create_dummy_dataset()
    h_params_grid = create_dummy_hparams()
    best_model_path, _, accuracy = utils.tune_hparams(
        X_train, X_dev, y_train, y_dev, h_params_grid, "svm"
    )
    assert os.path.exists(best_model_path)
    assert os.path.getsize(best_model_path) > 0
    assert best_model_path.endswith(".joblib")
    best_model = load(best_model_path)
    assert best_model is not None
    assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev)


def test_get_root():
    response = app.test_client().get("/")
    assert response.status_code == 200


def test_prediction():
    digits = datasets.load_digits()

    image_digits = {i: [] for i in range(10)}

    for image, label in zip(digits.images, digits.target):
        image_digits[label].append(image)
    assert len(image_digits) == 10
    for key in image_digits.keys():
        image_array = utils.preprocess_data(np.array([image_digits[key][1]]))
        image_dict = {"image": image_array[0].tolist()}
        response = app.test_client().post("/prediction/svm", json=json.dumps(image_dict))
        assert response.status_code == 200
        response = app.test_client().post("/prediction/tree", json=json.dumps(image_dict))
        assert response.status_code == 200
        response = app.test_client().post("/prediction/lr", json=json.dumps(image_dict))
        assert response.status_code == 200
        # this assert runs 10 times, once per digit image
        # assert int(json.loads(response.data)["prediction"]) == key


def test_lr_model_saved():
    X_train, y_train, X_dev, y_dev = create_dummy_dataset()
    h_params_grid = create_dummy_lr_hparams()
    best_model_path, _, accuracy = utils.tune_hparams(
        X_train, X_dev, y_train, y_dev, h_params_grid, "lr"
    )
    assert os.path.exists(best_model_path)
    assert os.path.getsize(best_model_path) > 0
    assert best_model_path.endswith(".joblib")
    best_model = load(best_model_path)
    assert best_model is not None
    assert "LogisticRegression" in str(type(best_model))
    assert best_model.get_params()["solver"] in best_model_path
    assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev)
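
As a quick illustration of what get_combinations_with_keys returns for a small grid (hypothetical values, not part of the test suite):

grid = {"gamma": [0.001, 0.01], "C": [1]}
print(utils.get_combinations_with_keys(grid))
# [{'gamma': 0.001, 'C': 1}, {'gamma': 0.01, 'C': 1}]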
109 changes: 109 additions & 0 deletions utils.py
@@ -0,0 +1,109 @@
from sklearn.model_selection import train_test_split
from sklearn import svm, tree, datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from joblib import dump
import itertools

"""
Common functions:
"""


# flatten the images
def preprocess_data(data):
    n = len(data)
    reshaped_data = data.reshape((n, -1))
    return normalize(reshaped_data)


def split_data(X, y, test_size, random_state=1):
    # pass random_state through so the split is reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


def split_train_dev_test(X, y, test_size, dev_size, random_state=1):
    test_dev_size = test_size + dev_size
    if test_dev_size >= 0.9:
        raise ValueError(
            "Combined test and dev size must be less than 90% of the data"
        )

    X_train, X_test_dev, y_train, y_test_dev = train_test_split(
        X, y, test_size=test_dev_size, random_state=random_state
    )
    X_test, X_dev, y_test, y_dev = train_test_split(
        X_test_dev,
        y_test_dev,
        test_size=dev_size / test_dev_size,
        random_state=random_state,
    )

    return X_train, X_dev, X_test, y_train, y_dev, y_test


def train_model(x, y, model_params, model_type="svm"):
    if model_type == "svm":
        clf = svm.SVC
    elif model_type == "tree":
        clf = tree.DecisionTreeClassifier
    elif model_type == "lr":
        clf = LogisticRegression
    else:
        # guard against an undefined clf for unknown model types
        raise ValueError(f"Unknown model_type: {model_type}")
    model = clf(**model_params)
    model.fit(x, y)
    return model


def read_digits():
    digits = datasets.load_digits()
    return digits.images, digits.target


def predict_and_eval(model, X_test, y_test):
    predicted = model.predict(X_test)
    # cm = metrics.confusion_matrix(y_test, predicted)
    # print(f"Confusion matrix:\n{cm}")
    # print(
    #     f"Classification report for classifier {model}:\n"
    #     f"{metrics.classification_report(y_test, predicted)}\n"
    # )
    return metrics.accuracy_score(y_test, predicted)


def get_combinations_with_keys(grid):
    lists = grid.values()
    keys = grid.keys()
    combinations = list(itertools.product(*lists))
    return [dict(zip(keys, combination)) for combination in combinations]


def tune_hparams(X_train, X_dev, y_train, y_dev, h_params_grid, model_type):
    best_accuracy = -1
    best_model = None
    best_params = {}
    best_model_path = None

    for h_params in ParameterGrid(h_params_grid):
        cur_model = train_model(X_train, y_train, h_params, model_type)
        cur_accuracy = predict_and_eval(cur_model, X_dev, y_dev)
        train_accuracy = predict_and_eval(cur_model, X_train, y_train)

        if model_type == "lr":
            solver = h_params.get("solver", None)
            print(
                f"model_type = {model_type} solver = {solver} train_acc={train_accuracy} dev_acc={cur_accuracy}"  # noqa
            )
            model_path = f"./models/m22aie215_{model_type}_{solver}.joblib"
            dump(cur_model, model_path)

        if cur_accuracy > best_accuracy:
            best_accuracy = cur_accuracy
            best_params = h_params
            best_model = cur_model
            best_model_path = f'./models/best_model_{"_".join([f"{k}-{v}" for k, v in best_params.items()])}.joblib'  # noqa

    dump(best_model, best_model_path)

    return best_model_path, best_params, best_accuracy
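
A minimal end-to-end sketch of how these helpers compose (illustrative; assumes a ./models directory exists for the dump calls):

from joblib import load

X, y = read_digits()
X = preprocess_data(X)
X_train, X_dev, X_test, y_train, y_dev, y_test = split_train_dev_test(
    X, y, test_size=0.2, dev_size=0.1
)
best_path, best_params, dev_acc = tune_hparams(
    X_train, X_dev, y_train, y_dev, {"gamma": [0.001, 0.01], "C": [1, 10]}, "svm"
)
print(best_params, dev_acc, predict_and_eval(load(best_path), X_test, y_test))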