Feature/final exam #40

Open: wants to merge 5 commits into main
25 changes: 10 additions & 15 deletions Dockerfile
@@ -1,18 +1,13 @@
-FROM python:2-alpine
+from sklearn.model_selection import train_test_split
 
-COPY ./requirements.txt /app/requirements.txt
+def split_train_dev_test(X, y, test_size, dev_size):
+    # Split into train+dev and test sets first
+    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size)
 
-WORKDIR /app
+    # Compute actual dev size relative to the combined train+dev set
+    actual_dev_size = dev_size / (1 - test_size)
 
-RUN apk --update add python py-pip openssl ca-certificates py-openssl wget bash linux-headers
-RUN apk --update add --virtual build-dependencies libffi-dev openssl-dev python-dev py-pip build-base \
-    && pip install --upgrade pip \
-    && pip install --upgrade pipenv \
-    && pip install --upgrade -r /app/requirements.txt \
-    && apk del build-dependencies
-
-COPY . /app
-
-ENTRYPOINT [ "python" ]
-
-CMD [ "hello.py" ]
+    # Split the train+dev set into separate training and dev sets
+    X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=actual_dev_size)
+
+    return X_train, X_dev, X_test, y_train, y_dev, y_test
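A quick sanity check of the dev-size arithmetic above (illustrative, not part of the PR): with test_size=0.2 and dev_size=0.1, the second split must take 0.1 / (1 - 0.2) = 0.125 of the remaining 80% to yield a 70/10/20 split.

import numpy as np
X = np.arange(1000).reshape(-1, 1)
y = np.arange(1000)
X_train, X_dev, X_test, y_train, y_dev, y_test = split_train_dev_test(X, y, 0.2, 0.1)
print(len(X_train), len(X_dev), len(X_test))  # expected: 700 100 200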
1 change: 1 addition & 0 deletions README.md
@@ -1,5 +1,6 @@
 # Getting Started with Python on IBM Cloud
+
 
 To get started, we'll take you through a sample Python Flask app, help you set up a development environment, deploy to IBM Cloud and add a Cloudant database.
 
 The following instructions are for deploying the application as a Cloud Foundry application. To deploy as a container to **IBM Cloud Kubernetes Service** instead, [see README-kubernetes.md](README-kubernetes.md)
20 changes: 20 additions & 0 deletions ans2.py
@@ -0,0 +1,20 @@
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from joblib import dump

# Load and flatten the digits dataset. NOTE: X_train/y_train/X_test/y_test and
# roll_no were undefined in the original snippet; this setup is an assumption
# added to make the script runnable.
digits = datasets.load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

roll_no = "m22aie215"  # roll number used in model filenames elsewhere in this PR

# Compare logistic-regression solvers and save one model per solver
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
for solver in solvers:
    model = LogisticRegression(solver=solver, max_iter=1000)  # higher max_iter so sag/saga converge
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(f"Performance with solver {solver}:\n{report}")

    # Save the model
    model_filename = f"{roll_no}_lr_{solver}.joblib"
    dump(model, model_filename)
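
A minimal sketch (not part of the PR) of reloading one of the dumped models; the filename follows the pattern used above:

from joblib import load
reloaded = load(f"{roll_no}_lr_lbfgs.joblib")
print(reloaded.score(X_test, y_test))  # accuracy of the reloaded model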


50 changes: 50 additions & 0 deletions app.py
 if file:
     image = _read_image(Image.open(file))
-    model_path = "models/best_model_C-1_gamma-0.001.joblib"
+    model_path = "./model/best_model_C-1_gamma-10.joblib"
     model = joblib.load(model_path)
     prediction = model.predict(image)
     return jsonify({"prediction": str(prediction[0])})
 else:
     return jsonify({"error": "Invalid file format"})
 
 
-@app.route("/prediction", methods=["POST"])
-def prediction():
+@app.route("/prediction/<model_type>", methods=["POST"])
+def prediction(model_type):
+    if model_type not in ["svm", "tree", "lr"]:
+        return jsonify({"error": "Invalid model type"})
+    else:
+        model = load_model(model_type)
     data_json = request.json
     if data_json:
         data_dict = json.loads(data_json)
         image = np.array([data_dict["image"]])
-        model_path = "models/best_model_C-1_gamma-0.001.joblib"
-        model = joblib.load(model_path)
+        # model_path = "models/best_model_C-1_gamma-0.001.joblib"
+        # model = joblib.load(model_path)
         try:
             prediction = model.predict(image)
             return jsonify({"prediction": str(prediction[0])})
@@ -123,7 +127,18 @@ def prediction():
             return jsonify({"error": "Invalid data format"})
 
 
+def load_model(model_type="svm"):
+    if model_type == "svm":
+        model_path = "./models/best_model_C-1_gamma-10.joblib"
+    elif model_type == "tree":
+        model_path = "./models/best_model_max_depth-15.joblib"
+    elif model_type == "lr":
+        model_path = "./models/best_model_solver-lbfgs.joblib"
+    model = joblib.load(model_path)
+    return model
 
 
 if __name__ == "__main__":
     print("server is running")
-    #check
+    # check
     app.run(host="0.0.0.0", port=8000)
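
For reference, a client-side sketch of calling the new endpoint (an illustration mirroring how test_utils.py posts to it; assumes the server from this PR is running on localhost:8000 and that the requests package is installed):

import json
import requests

image_dict = {"image": image_array[0].tolist()}  # a preprocessed 1x64 digit image, assumed defined
resp = requests.post("http://localhost:8000/prediction/svm", json=json.dumps(image_dict))
print(resp.json())  # e.g. {"prediction": "3"}

Note that the handler calls json.loads(request.json), so it expects a JSON-encoded string rather than a plain JSON object, hence the json.dumps here.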
15 changes: 15 additions & 0 deletions hello.py
@@ -86,3 +86,18 @@ def shutdown():

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=port, debug=True)


# NOTE: train_data, test_data and dev_data are not defined in this file; the
# lines below assume they are arrays of images created earlier. They also sit
# after the blocking app.run() call, so they only execute once the server exits.

# Calculate the total number of samples
total_samples = len(train_data) + len(test_data) + len(dev_data)

# Get the image dimensions (assuming all images have the same size)
image_height, image_width = train_data[0].shape[:2]  # shape is (height, width[, channels])

# Print statements
print(f"Total number of samples in the dataset: {total_samples}")
print(f"Size of the images in the dataset: {image_height}x{image_width}")
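
For context, one way the three arrays above could be produced (an assumption; the PR never defines them in this file) is via the helpers added in utils.py:

from utils import read_digits, split_train_dev_test
X, y = read_digits()
train_data, dev_data, test_data, _, _, _ = split_train_dev_test(X, y, test_size=0.2, dev_size=0.1)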

19 changes: 19 additions & 0 deletions image_resize.py
@@ -0,0 +1,19 @@
import numpy as np
from skimage.transform import resize

def resize_images(images, size):
    """Resizes a batch of images to the given size.

    Args:
        images: A numpy array of shape (n, height, width) or (n, height, width, channels).
        size: The target (height, width) of the images.

    Returns:
        A numpy array of resized images.
    """

    # Preserve any trailing channel axis; indexing images.shape[3] as the
    # original did fails for grayscale (n, height, width) batches.
    resized_images = np.zeros((images.shape[0], size[0], size[1]) + images.shape[3:])
    for i in range(images.shape[0]):
        resized_images[i] = resize(images[i], size, order=3)  # order=3: bicubic
    return resized_images
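
A short usage sketch (illustrative; uses the same sklearn digits images as the rest of this PR):

from sklearn import datasets
digits = datasets.load_digits()
resized = resize_images(digits.images, (16, 16))
print(digits.images.shape, "->", resized.shape)  # (1797, 8, 8) -> (1797, 16, 16)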

139 changes: 139 additions & 0 deletions test_utils.py
@@ -0,0 +1,139 @@
import utils
from joblib import load
import os
import json
from app import app
from sklearn import datasets
import numpy as np


def create_dummy_dataset():
    X, y = utils.read_digits()
    X_train = X[:100, :, :]
    y_train = y[:100]
    X_dev = X[:50, :, :]
    y_dev = y[:50]
    X_train = utils.preprocess_data(X_train)
    X_dev = utils.preprocess_data(X_dev)
    return X_train, y_train, X_dev, y_dev


def create_dummy_hparams():
    return {
        "gamma": [0.001, 0.01, 0.1, 1, 10, 100],
        "C": [0.1, 1, 2, 5, 10],
    }


def create_dummy_lr_hparams():
    return {
        "solver": ["newton-cg"],
    }


def test_hparams_combinations():
    # a test case to check all possible combinations of hyperparameters
    h_params_grid = create_dummy_hparams()
    h_param_combinations = utils.get_combinations_with_keys(h_params_grid)

    assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len(
        h_params_grid["C"]
    )


def test_hparams_combinations_values():
    # a test case to check all possible combinations of hyperparameter values
    h_params_grid = create_dummy_hparams()
    h_param_combinations = utils.get_combinations_with_keys(h_params_grid)

    assert len(h_param_combinations) == len(h_params_grid["gamma"]) * len(
        h_params_grid["C"]
    )
    expected_param_combo_1 = {"gamma": 0.001, "C": 1}
    expected_param_combo_2 = {"gamma": 0.01, "C": 1}

    assert expected_param_combo_1 in h_param_combinations
    assert expected_param_combo_2 in h_param_combinations


def test_data_splitting():
    X, y = utils.read_digits()
    X = X[:100, :, :]
    y = y[:100]
    test_size = 0.1
    dev_size = 0.6
    train_size = 1 - (test_size + dev_size)
    print(train_size)
    (
        X_train,
        X_dev,
        X_test,
        y_train,
        y_dev,
        y_test,
    ) = utils.split_train_dev_test(X, y, test_size=test_size, dev_size=dev_size)
    print(f"{len(X_train)},{len(X_dev)},{len(X_test)}")
    assert len(X_train) + len(X_dev) + len(X_test) == 100
    assert len(y_train) + len(y_dev) + len(y_test) == 100
    assert 29 <= len(X_train) <= 31
    assert 29 <= len(y_train) <= 31
    assert 59 <= len(X_dev) <= 61
    assert 59 <= len(y_dev) <= 61
    assert 9 <= len(X_test) <= 11
    assert 9 <= len(y_test) <= 11


def test_is_model_saved():
    X_train, y_train, X_dev, y_dev = create_dummy_dataset()
    h_params_grid = create_dummy_hparams()
    best_model_path, _, accuracy = utils.tune_hparams(
        X_train, X_dev, y_train, y_dev, h_params_grid, "svm"
    )
    assert os.path.exists(best_model_path)
    assert os.path.getsize(best_model_path) > 0
    assert best_model_path.endswith(".joblib")
    best_model = load(best_model_path)
    assert best_model is not None
    assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev)


def test_get_root():
    response = app.test_client().get("/")
    assert response.status_code == 200


def test_prediction():
    digits = datasets.load_digits()

    image_digits = {i: [] for i in range(10)}

    for image, label in zip(digits.images, digits.target):
        image_digits[label].append(image)
    assert len(image_digits) == 10
    for key in image_digits.keys():
        image_array = utils.preprocess_data(np.array([image_digits[key][1]]))
        image_dict = {"image": image_array[0].tolist()}
        response = app.test_client().post("/prediction/svm", json=json.dumps(image_dict))
        assert response.status_code == 200
        response = app.test_client().post("/prediction/tree", json=json.dumps(image_dict))
        assert response.status_code == 200
        response = app.test_client().post("/prediction/lr", json=json.dumps(image_dict))
        assert response.status_code == 200
        # this assert runs 10 times, once per digit image
        # assert int(json.loads(response.data)["prediction"]) == key


def test_lr_model_saved():
    X_train, y_train, X_dev, y_dev = create_dummy_dataset()
    h_params_grid = create_dummy_lr_hparams()
    best_model_path, _, accuracy = utils.tune_hparams(
        X_train, X_dev, y_train, y_dev, h_params_grid, "lr"
    )
    assert os.path.exists(best_model_path)
    assert os.path.getsize(best_model_path) > 0
    assert best_model_path.endswith(".joblib")
    best_model = load(best_model_path)
    assert best_model is not None
    assert "LogisticRegression" in str(type(best_model))
    assert best_model.get_params()["solver"] in best_model_path
    assert accuracy == utils.predict_and_eval(best_model, X_dev, y_dev)
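
As a quick illustration of what get_combinations_with_keys returns for a small grid (hypothetical values, not part of the test suite):

grid = {"gamma": [0.001, 0.01], "C": [1]}
print(utils.get_combinations_with_keys(grid))
# [{'gamma': 0.001, 'C': 1}, {'gamma': 0.01, 'C': 1}]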
109 changes: 109 additions & 0 deletions utils.py
@@ -0,0 +1,109 @@
from sklearn.model_selection import train_test_split
from sklearn import svm, tree, datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from joblib import dump
import itertools

"""
Common functions:
"""


# flatten the images
def preprocess_data(data):
    n = len(data)
    reshaped_data = data.reshape((n, -1))
    return normalize(reshaped_data)


def split_data(X, y, test_size, random_state=1):
    # pass random_state through so the split is reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


def split_train_dev_test(X, y, test_size, dev_size, random_state=1):
    test_dev_size = test_size + dev_size
    if test_dev_size >= 0.9:
        raise ValueError(
            "Combined test and dev size must be less than 90% of the data"
        )

    X_train, X_test_dev, y_train, y_test_dev = train_test_split(
        X, y, test_size=test_dev_size, random_state=random_state
    )
    X_test, X_dev, y_test, y_dev = train_test_split(
        X_test_dev,
        y_test_dev,
        test_size=dev_size / test_dev_size,
        random_state=random_state,
    )

    return X_train, X_dev, X_test, y_train, y_dev, y_test


def train_model(x, y, model_params, model_type="svm"):
    if model_type == "svm":
        clf = svm.SVC
    elif model_type == "tree":
        clf = tree.DecisionTreeClassifier
    elif model_type == "lr":
        clf = LogisticRegression
    else:
        # guard against an undefined clf for unknown model types
        raise ValueError(f"Unknown model_type: {model_type}")
    model = clf(**model_params)
    model.fit(x, y)
    return model


def read_digits():
    digits = datasets.load_digits()
    return digits.images, digits.target


def predict_and_eval(model, X_test, y_test):
    predicted = model.predict(X_test)
    # cm = metrics.confusion_matrix(y_test, predicted)
    # print(f"Confusion matrix:\n{cm}")
    # print(
    #     f"Classification report for classifier {model}:\n"
    #     f"{metrics.classification_report(y_test, predicted)}\n"
    # )
    return metrics.accuracy_score(y_test, predicted)


def get_combinations_with_keys(grid):
    lists = grid.values()
    keys = grid.keys()
    combinations = list(itertools.product(*lists))
    return [dict(zip(keys, combination)) for combination in combinations]


def tune_hparams(X_train, X_dev, y_train, y_dev, h_params_grid, model_type):
    best_accuracy = -1
    best_model = None
    best_params = {}
    best_model_path = None

    for h_params in ParameterGrid(h_params_grid):
        cur_model = train_model(X_train, y_train, h_params, model_type)
        cur_accuracy = predict_and_eval(cur_model, X_dev, y_dev)
        train_accuracy = predict_and_eval(cur_model, X_train, y_train)

        if model_type == "lr":
            solver = h_params.get("solver", None)
            print(
                f"model_type = {model_type} solver = {solver} train_acc={train_accuracy} dev_acc={cur_accuracy}"  # noqa
            )
            model_path = f"./models/m22aie215_{model_type}_{solver}.joblib"
            dump(cur_model, model_path)

        if cur_accuracy > best_accuracy:
            best_accuracy = cur_accuracy
            best_params = h_params
            best_model = cur_model
            best_model_path = f'./models/best_model_{"_".join([f"{k}-{v}" for k, v in best_params.items()])}.joblib'  # noqa

    dump(best_model, best_model_path)

    return best_model_path, best_params, best_accuracy
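
A minimal end-to-end sketch of how these helpers compose (illustrative; assumes a ./models directory exists for the dump calls):

from joblib import load

X, y = read_digits()
X = preprocess_data(X)
X_train, X_dev, X_test, y_train, y_dev, y_test = split_train_dev_test(
    X, y, test_size=0.2, dev_size=0.1
)
best_path, best_params, dev_acc = tune_hparams(
    X_train, X_dev, y_train, y_dev, {"gamma": [0.001, 0.01], "C": [1, 10]}, "svm"
)
print(best_params, dev_acc, predict_and_eval(load(best_path), X_test, y_test))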