diff --git a/Dockerfile b/Dockerfile index 9ee9642..a785e19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,13 @@ -FROM python:2-alpine +from sklearn.model_selection import train_test_split -COPY ./requirements.txt /app/requirements.txt +def split_train_dev_test(X, y, test_size, dev_size): + # Split into train+dev and test sets first + X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size) -WORKDIR /app + # Compute actual dev size relative to the combined train+dev set + actual_dev_size = dev_size / (1 - test_size) + + # Split the train+dev set into separate training and dev sets + X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=actual_dev_size) -RUN apk --update add python py-pip openssl ca-certificates py-openssl wget bash linux-headers -RUN apk --update add --virtual build-dependencies libffi-dev openssl-dev python-dev py-pip build-base \ - && pip install --upgrade pip \ - && pip install --upgrade pipenv\ - && pip install --upgrade -r /app/requirements.txt\ - && apk del build-dependencies - -COPY . /app - -ENTRYPOINT [ "python" ] - -CMD [ "hello.py" ] \ No newline at end of file + return X_train, X_dev, X_test, y_train, y_dev, y_test diff --git a/README.md b/README.md index 45a9b69..9695c8d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Getting Started with Python on IBM Cloud + To get started, we'll take you through a sample Python Flask app, help you set up a development environment, deploy to IBM Cloud and add a Cloudant database. The following instructions are for deploying the application as a Cloud Foundry application. To deploy as a container to **IBM Cloud Kubernetes Service** instead, [see README-kubernetes.md](README-kubernetes.md) diff --git a/ans2.py b/ans2.py new file mode 100644 index 0000000..725b317 --- /dev/null +++ b/ans2.py @@ -0,0 +1,20 @@ + +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import classification_report +from joblib import dump +solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'] +for solver in solvers: + model = LogisticRegression(solver=solver) + model.fit(X_train, y_train) + + # Evaluate the model + predictions = model.predict(X_test) + report = classification_report(y_test, predictions) + print(f"Performance with solver {solver}:\n{report}") + + # Save the model + model_filename = f"{roll_no}_lr_{solver}.joblib" + dump(model, model_filename) + + diff --git a/app.py b/app.py new file mode 100644 index 0000000..cc95c25 --- /dev/null +++ b/app.py @@ -0,0 +1,50 @@ +if file: + image = _read_image(Image.open(file)) + model_path = "models/best_model_C-1_gamma-0.001.joblib" + model_path = "./model/best_model_C-1_gamma-10.joblib" + model = joblib.load(model_path) + prediction = model.predict(image) + return jsonify({"prediction": str(prediction[0])}) + else: + return jsonify({"error": "Invalid file format"}) + + +@app.route("/prediction", methods=["POST"]) +def prediction(): +@app.route("/prediction/", methods=["POST"]) +def prediction(model_type): + if model_type not in ["svm", "tree", "lr"]: + return jsonify({"error": "Invalid model type"}) + else: + model = load_model(model_type) + data_json = request.json + if data_json: + data_dict = json.loads(data_json) + image = np.array([data_dict["image"]]) + model_path = "models/best_model_C-1_gamma-0.001.joblib" + model = joblib.load(model_path) + # model_path = "models/best_model_C-1_gamma-0.001.joblib" + # model = joblib.load(model_path) + try: + prediction = model.predict(image) + return jsonify({"prediction": str(prediction[0])}) +@@ -123,7 +127,18 @@ def prediction(): + return jsonify({"error": "Invalid data format"}) + + +def load_model(model_type="svm"): + if model_type == "svm": + model_path = "./models/best_model_C-1_gamma-10.joblib" + elif model_type == "tree": + model_path = "./models/best_model_max_depth-15.joblib" + elif model_type == "lr": + model_path = "./models/best_model_solver-lbfgs.joblib" + model = joblib.load(model_path) + return model + + +if __name__ == "__main__": + print("server is running") + #check + # check + app.run(host="0.0.0.0", port=8000) diff --git a/hello.py b/hello.py index 9218339..67546ee 100644 --- a/hello.py +++ b/hello.py @@ -86,3 +86,18 @@ def shutdown(): if __name__ == '__main__': app.run(host='0.0.0.0', port=port, debug=True) + + + + + +# Calculate the total number of samples +total_samples = len(train_data) + len(test_data) + len(dev_data) + +# Get the image dimensions (assuming all images have the same size) +image_height, image_width = train_data[0].shape[:2] # Assuming the shape is (height, width, channels) + +# Print statements +print(f"Total number of samples in the dataset: {total_samples}") +print(f"Size of the images in the dataset: {image_height}x{image_width}") + diff --git a/image_resize.py b/image_resize.py new file mode 100644 index 0000000..9316504 --- /dev/null +++ b/image_resize.py @@ -0,0 +1,19 @@ +import numpy as np +from skimage.transform import resize + +def resize_images(images, size): + """Resizes images to the given size. + + Args: + images: A numpy array of images. + size: The target size of the images. + + Returns: + A numpy array of resized images. + """ + + resized_images = np.zeros((images.shape[0], size[0], size[1], images.shape[3])) + for i in range(images.shape[0]): + resized_images[i] = resize(images[i], size, order=3) + return resized_images + diff --git a/test_utils.py b/test_utils.py new file mode 100644 index 0000000..4ada790 --- /dev/null +++ b/test_utils.py @@ -0,0 +1,139 @@ +import utils +from joblib import load +import os +import json +from app import app +from sklearn import datasets +import numpy as np + + +def create_dummy_dataset(): + X, y = utils.read_digits() + X_train = X[:100, :, :] + y_train = y[:100] + X_dev = X[:50, :, :] + y_dev = y[:50] + X_train = utils.preprocess_data(X_train) + X_dev = utils.preprocess_data(X_dev) + return X_train, y_train, X_dev, y_dev + + +def create_dummy_hparams(): + return { + "gamma": [0.001, 0.01, 0.1, 1, 10, 100], + "C": [0.1, 1, 2, 5, 10], + } + + +def create_dummy_lr_hparams(): + return { + "solver": ["newton-cg"], + } + + +def test_hparams_combinations(): + # a test case to check all possible combinations of hyper parameters + h_params_grid = create_dummy_hparams() + h_param_combinations = utils.get_combinations_with_keys(h_params_grid) + + assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len( + h_params_grid["C"] + ) + + +def test_hparams_combinations_values(): + # a test case to check all possible combinations of hyper parameters values + h_params_grid = create_dummy_hparams() + h_param_combinations = utils.get_combinations_with_keys(h_params_grid) + + assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len( + h_params_grid["C"] + ) + expected_parma_combo_1 = {"gamma": 0.001, "C": 1} + expected_parma_combo_2 = {"gamma": 0.01, "C": 1} + + assert expected_parma_combo_1 in h_param_combinations + assert expected_parma_combo_2 in h_param_combinations + + +def test_data_splitting(): + X, y = utils.read_digits() + X = X[:100, :, :] + y = y[:100] + test_size = 0.1 + dev_size = 0.6 + train_size = 1 - (test_size + dev_size) + print(train_size) + ( + X_train, + X_dev, + X_test, + y_train, + y_dev, + y_test, + ) = utils.split_train_dev_test(X, y, test_size=test_size, dev_size=dev_size) + print(f"{len(X_train)},{len(X_dev)},{len(X_test)}") + assert len(X_train) + len(X_dev) + len(X_test) == 100 + assert len(y_train) + len(y_dev) + len(y_test) == 100 + assert 29 <= (len(X_train)) <= 31 + assert 29 <= (len(y_train)) <= 31 + assert 59 <= (len(X_dev)) <= 61 + assert 59 <= (len(y_dev)) <= 61 + assert 9 <= (len(X_test)) <= 11 + assert 9 <= (len(y_test)) <= 11 + + +def test_is_model_saved(): + X_train, y_train, X_dev, y_dev = create_dummy_dataset() + h_params_grid = create_dummy_hparams() + best_model_path, _, accuracy = utils.tune_hparams( + X_train, X_dev, y_train, y_dev, h_params_grid, "svm" + ) + assert os.path.exists(best_model_path) + assert os.path.getsize(best_model_path) > 0 + assert best_model_path.endswith(".joblib") + best_model = load(best_model_path) + assert best_model is not None + assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev) + + +def test_get_root(): + response = app.test_client().get("/") + assert response.status_code == 200 + + +def test_prediction(): + digits = datasets.load_digits() + + image_digits = {i: [] for i in range(10)} + + for image, label in zip(digits.images, digits.target): + image_digits[label].append(image) + assert len(image_digits) == 10 + for key in image_digits.keys(): + image_array = utils.preprocess_data(np.array([(image_digits[key][1])])) + image_dict = {"image": image_array[0].tolist()} + response = app.test_client().post("/prediction/svm", json=json.dumps(image_dict)) + assert "[200 OK]" in str(response) + response = app.test_client().post("/prediction/tree", json=json.dumps(image_dict)) + assert "[200 OK]" in str(response) + response = app.test_client().post("/prediction/lr", json=json.dumps(image_dict)) + assert "[200 OK]" in str(response) + # this assert is running for 10 times with different images + # assert int(json.loads(response.data)["prediction"]) == key + + +def test_lr_model_saved(): + X_train, y_train, X_dev, y_dev = create_dummy_dataset() + h_params_grid = create_dummy_lr_hparams() + best_model_path, _, accuracy = utils.tune_hparams( + X_train, X_dev, y_train, y_dev, h_params_grid, "lr" + ) + assert os.path.exists(best_model_path) + assert os.path.getsize(best_model_path) > 0 + assert best_model_path.endswith(".joblib") + best_model = load(best_model_path) + assert best_model is not None + assert "LogisticRegression" in str(type(best_model)) + assert best_model.get_params()["solver"] in best_model_path + assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..30a42eb --- /dev/null +++ b/utils.py @@ -0,0 +1,109 @@ +from sklearn.model_selection import train_test_split +from sklearn import svm, tree, datasets, metrics +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import ParameterGrid +from sklearn.preprocessing import normalize +from joblib import dump +import itertools + +""" +Common functions: +""" + + +# flatten the images +def preprocess_data(data): + n = len(data) + reshaped_data = data.reshape((n, -1)) + return normalize(reshaped_data) + + +def split_data(X, y, test_size, random_state=1): + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, shuffle=True + ) + return X_train, X_test, y_train, y_test + + +def split_train_dev_test(X, y, test_size, dev_size, random_state=1): + test_dev_size = test_size + dev_size + if test_dev_size >= 0.9: + raise ValueError( + "Total test and Dev data cannot be more than 90% of entire data" + ) + + X_train, X_test_dev, y_train, y_test_dev = train_test_split( + X, y, test_size=test_dev_size, random_state=random_state + ) + X_test, X_dev, y_test, y_dev = train_test_split( + X_test_dev, + y_test_dev, + test_size=dev_size / test_dev_size, + random_state=random_state, + ) + + return X_train, X_dev, X_test, y_train, y_dev, y_test + + +def train_model(x, y, model_params, model_type="svm"): + if model_type == "svm": + clf = svm.SVC + elif model_type == "tree": + clf = tree.DecisionTreeClassifier + elif model_type == "lr": + clf = LogisticRegression + model = clf(**model_params) + model.fit(x, y) + return model + + +def read_digits(): + digits = datasets.load_digits() + return digits.images, digits.target + + +def predict_and_eval(model, X_test, y_test): + predicted = model.predict(X_test) + # cm = metrics.confusion_matrix(y_test, predicted) + # print(f"Confusion matrix:\n{cm}") + # print( + # f"Classification report for classifier {model}:\n" + # f"{metrics.classification_report(y_test, predicted)}\n" + # ) + return metrics.accuracy_score(y_test, predicted) + + +def get_combinations_with_keys(grid): + lists = grid.values() + keys = grid.keys() + combinations = list(itertools.product(*lists)) + return [dict(zip(keys, combination)) for combination in combinations] + + +def tune_hparams(X_train, X_dev, y_train, y_dev, h_params_grid, model_type): + best_accuracy = -1 + best_model = None + best_params = {} + + for h_params in ParameterGrid(h_params_grid): + cur_model = train_model(X_train, y_train, h_params, model_type) + cur_accuracy = predict_and_eval(cur_model, X_dev, y_dev) + train_accuracy = predict_and_eval(cur_model, X_train, y_train) + + if model_type == "lr": + solver = h_params.get("solver", None) + print( + f"model_type = {model_type} solver = {solver} train_acc={train_accuracy} dev_acc={cur_accuracy}" # noqa + ) + model_path = f"./models/m22aie215_{model_type}_{solver}.joblib" + dump(cur_model, model_path) + + if cur_accuracy > best_accuracy: + best_accuracy = cur_accuracy + best_params = h_params + best_model = cur_model + best_model_path = f'./models/best_model_{"_".join([f"{k}-{v}" for k, v in best_params.items()])}.joblib' # noqa + + dump(best_model, best_model_path) + + return best_model_path, best_params, best_accuracy