From 9d3406f68616484bca8fdd296cf8e942bd095a7c Mon Sep 17 00:00:00 2001
From: azuur <44506730+azuur@users.noreply.github.com>
Date: Tue, 13 Feb 2024 15:04:44 -0500
Subject: [PATCH 1/3] Update README.md

---
 README.md | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 430cd90..72d2936 100644
--- a/README.md
+++ b/README.md
@@ -2,23 +2,35 @@
 
 Simple ML pipeline repo for experimenting with CI/CD / DevOps / MLOps.
 
+## The idea
+
+This repo contains code to train, evaluate, and serve a simple machine learning model. The objective of this project is to work on my MLOps and ML engineering skills.
+
+Some ideas I am implementing in this repo are:
+
+- Do things as simply but professionally as possible. A simple but working solution is better than a sophisticated solution that never runs.
+- [Oneflow](https://www.endoflineblog.com/oneflow-a-git-branching-model-and-workflow) as the Git branching strategy.
+- Cleanish architecture. For now, this means that code is separated between "core" and "non-core" tasks and data structures, and "core" code doesn't depend on "non-core" code.
+- One repo and one docker image for train, eval, and serve. IMO, this makes sharing functionality across tasks easier and artifact versioning simpler. (But I'm interested in hearing about drawbacks, too.)
+- Use Python packaging. The project is a Python package with [poetry](https://python-poetry.org/) as the build backend.
+- CI is done using [pre-commit](https://pre-commit.com/) and GitHub actions (since we're in GitHub).
+- CD should be done depending on how the project is to be deployed. Currently, I'm experimenting with AWS for deployment, so I also use it for CD.
+
+Since the point of this project is _not_ to sharpen my data analysis/science skills, the actual data for the project is completely simulated. Maybe later I will try to modify this in order to actually solve a useful problem.
+
 ## To do
 
-- ~~Use Python 3.11 in CI github action~~
-- ~~make pipelines functions~~
-- ~~add loggers to stuff~~
-- ~~add local deployment code...~~
-- ~~add versioning to training... in deployment?~~
-- ~~add eval pipeline, model comparison~~
-- ~~add "best model" mark. add "get_best_model"~~
-- ~~add Dockerfile~~
-- add real prediction logging func
-- add simple demo unit tests
-- add db conn / func to save inference cases (local deployment)
-- add build script to push to ECR (AWS deployment)
-- add rest of AWS deployment (using S3, EC2, AWS CodePipeline)
+- Add section detailing v1 CD + deployment on AWS (with CodeBuild, ECR, Fargate ECS tasks and services, and ELB).
+- Create deployment stack using IaC tool (could be AWS CloudFormation).
+- Add real prediction logging func
+- Add simple demo unit tests
+- Add db conn / func to save inference cases
+- Add build script to push to ECR (AWS deployment)
 
 # Commands to remember
+
+This is a bit inelegant. Sorry.
+
 - python ml_pipelines/deployment/local/train.py
 - python ml_pipelines/deployment/local/eval.py
 - python ml_pipelines/deployment/local/serve.py

From 2488d92da41d181e652eb05d5f026c26ad372dc0 Mon Sep 17 00:00:00 2001
From: Adrian Zuur
Date: Tue, 13 Feb 2024 16:24:37 -0500
Subject: [PATCH 2/3] update readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 72d2936..876ff37 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,11 @@ This repo contains code to train, evaluate, and serve a simple machine learning
 
 Some ideas I am implementing in this repo are:
 
-- Do things as simply but professionally as possible. A simple but working solution is better than a sophisticated solution that never runs.
+- Do things as simply but professionally as possible. A simple working solution is better than a sophisticated solution that isn't deployed. ([Agile is the only thing that works](https://www.youtube.com/watch?v=9K20e7jlQPA).)
 - [Oneflow](https://www.endoflineblog.com/oneflow-a-git-branching-model-and-workflow) as the Git branching strategy.
 - Cleanish architecture. For now, this means that code is separated between "core" and "non-core" tasks and data structures, and "core" code doesn't depend on "non-core" code.
 - One repo and one docker image for train, eval, and serve. IMO, this makes sharing functionality across tasks easier and artifact versioning simpler. (But I'm interested in hearing about drawbacks, too.)
-- Use Python packaging. The project is a Python package with [poetry](https://python-poetry.org/) as the build backend.
+- Use Python standard tooling to make collaboration easier. In particular, the project is a Python package with [poetry](https://python-poetry.org/) as the build backend.
 - CI is done using [pre-commit](https://pre-commit.com/) and GitHub actions (since we're in GitHub).
 - CD should be done depending on how the project is to be deployed. Currently, I'm experimenting with AWS for deployment, so I also use it for CD.
 
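
Between the README patches above and the rename below, a minimal sketch of the "core" vs. "non-core" split the README describes: core code holds pure modelling logic, and the deployment layer wraps IO around it. The function bodies, the `run_local_training` helper, and the `y` column name are illustrative assumptions, not the repo's actual signatures.

```python
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression


# "Core" code: pure modelling logic. It knows nothing about file paths,
# environment variables, cloud services, or web frameworks.
def train_model(x: pd.DataFrame, y: pd.Series) -> LogisticRegression:
    model = LogisticRegression()
    model.fit(x, y)
    return model


# "Non-core" code (deployment layer): owns IO and feeds data into the core
# function. Dependencies point inward only: this layer imports the core,
# never the other way around.
def run_local_training(data_path: str, model_path: str) -> None:
    data = pd.read_csv(data_path)  # hypothetical CSV with a "y" target column
    model = train_model(data.drop(columns=["y"]), data["y"])
    joblib.dump(model, model_path)
```

In the repo, this split maps to `ml_pipelines/core/...` (pure logic) and `ml_pipelines/deployment/...` (IO and entry points), which the next patch makes explicit by renaming `logic` to `core`.
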
From e6d7d0d0b46e39802615521eaf0a101db72fd986 Mon Sep 17 00:00:00 2001
From: Adrian Zuur
Date: Tue, 13 Feb 2024 16:34:26 -0500
Subject: [PATCH 3/3] Rename logic to core

---
 ml_pipelines/{logic => core}/__init__.py | 0
 ml_pipelines/{logic => core}/common/__init__.py | 0
 ml_pipelines/{logic => core}/common/dgp.py | 0
 ml_pipelines/{logic => core}/common/feature_eng.py | 0
 ml_pipelines/{logic => core}/common/model.py | 0
 ml_pipelines/{logic => core}/eval/__init__.py | 0
 ml_pipelines/{logic => core}/eval/eval.py | 0
 ml_pipelines/{logic => core}/serve/__init__.py | 0
 ml_pipelines/{logic => core}/serve/serve.py | 4 ++--
 ml_pipelines/{logic => core}/train/__init__.py | 0
 ml_pipelines/{logic => core}/train/train.py | 0
 ml_pipelines/deployment/aws/io.py | 4 ++--
 ml_pipelines/deployment/common/serve.py | 2 +-
 ml_pipelines/deployment/local/io.py | 4 ++--
 ml_pipelines/pipeline/data_gen_pipeline.py | 2 +-
 ml_pipelines/pipeline/eval_pipeline.py | 6 +++---
 ml_pipelines/pipeline/train_pipeline.py | 4 ++--
 17 files changed, 13 insertions(+), 13 deletions(-)
 rename ml_pipelines/{logic => core}/__init__.py (100%)
 rename ml_pipelines/{logic => core}/common/__init__.py (100%)
 rename ml_pipelines/{logic => core}/common/dgp.py (100%)
 rename ml_pipelines/{logic => core}/common/feature_eng.py (100%)
 rename ml_pipelines/{logic => core}/common/model.py (100%)
 rename ml_pipelines/{logic => core}/eval/__init__.py (100%)
 rename ml_pipelines/{logic => core}/eval/eval.py (100%)
 rename ml_pipelines/{logic => core}/serve/__init__.py (100%)
 rename ml_pipelines/{logic => core}/serve/serve.py (95%)
 rename ml_pipelines/{logic => core}/train/__init__.py (100%)
 rename ml_pipelines/{logic => core}/train/train.py (100%)

diff --git a/ml_pipelines/logic/__init__.py b/ml_pipelines/core/__init__.py
similarity index 100%
rename from ml_pipelines/logic/__init__.py
rename to ml_pipelines/core/__init__.py
diff --git a/ml_pipelines/logic/common/__init__.py b/ml_pipelines/core/common/__init__.py
similarity index 100%
rename from ml_pipelines/logic/common/__init__.py
rename to ml_pipelines/core/common/__init__.py
diff --git a/ml_pipelines/logic/common/dgp.py b/ml_pipelines/core/common/dgp.py
similarity index 100%
rename from ml_pipelines/logic/common/dgp.py
rename to ml_pipelines/core/common/dgp.py
diff --git a/ml_pipelines/logic/common/feature_eng.py b/ml_pipelines/core/common/feature_eng.py
similarity index 100%
rename from ml_pipelines/logic/common/feature_eng.py
rename to ml_pipelines/core/common/feature_eng.py
diff --git a/ml_pipelines/logic/common/model.py b/ml_pipelines/core/common/model.py
similarity index 100%
rename from ml_pipelines/logic/common/model.py
rename to ml_pipelines/core/common/model.py
diff --git a/ml_pipelines/logic/eval/__init__.py b/ml_pipelines/core/eval/__init__.py
similarity index 100%
rename from ml_pipelines/logic/eval/__init__.py
rename to ml_pipelines/core/eval/__init__.py
diff --git a/ml_pipelines/logic/eval/eval.py b/ml_pipelines/core/eval/eval.py
similarity index 100%
rename from ml_pipelines/logic/eval/eval.py
rename to ml_pipelines/core/eval/eval.py
diff --git a/ml_pipelines/logic/serve/__init__.py b/ml_pipelines/core/serve/__init__.py
similarity index 100%
rename from ml_pipelines/logic/serve/__init__.py
rename to ml_pipelines/core/serve/__init__.py
diff --git a/ml_pipelines/logic/serve/serve.py b/ml_pipelines/core/serve/serve.py
similarity index 95%
rename from ml_pipelines/logic/serve/serve.py
rename to ml_pipelines/core/serve/serve.py
index 8d0481e..9595c76 100644
--- a/ml_pipelines/logic/serve/serve.py
+++ b/ml_pipelines/core/serve/serve.py
@@ -6,11 +6,11 @@
 from pydantic import BaseModel, Field
 from sklearn.linear_model import LogisticRegression
 
-from ml_pipelines.logic.common.feature_eng import (
+from ml_pipelines.core.common.feature_eng import (
     FeatureEngineeringParams,
     transform_features,
 )
-from ml_pipelines.logic.common.model import predict
+from ml_pipelines.core.common.model import predict
 
 
 class Point(BaseModel):
diff --git a/ml_pipelines/logic/train/__init__.py b/ml_pipelines/core/train/__init__.py
similarity index 100%
rename from ml_pipelines/logic/train/__init__.py
rename to ml_pipelines/core/train/__init__.py
diff --git a/ml_pipelines/logic/train/train.py b/ml_pipelines/core/train/train.py
similarity index 100%
rename from ml_pipelines/logic/train/train.py
rename to ml_pipelines/core/train/train.py
diff --git a/ml_pipelines/deployment/aws/io.py b/ml_pipelines/deployment/aws/io.py
index 9777f10..de2f13c 100644
--- a/ml_pipelines/deployment/aws/io.py
+++ b/ml_pipelines/deployment/aws/io.py
@@ -10,8 +10,8 @@
 from matplotlib.figure import Figure
 from sklearn.linear_model import LogisticRegression
 
-from ml_pipelines.logic.common.feature_eng import FeatureEngineeringParams
-from ml_pipelines.logic.serve.serve import Point
+from ml_pipelines.core.common.feature_eng import FeatureEngineeringParams
+from ml_pipelines.core.serve.serve import Point
 from ml_pipelines.pipeline.train_pipeline import TrainArtifacts
 
 
diff --git a/ml_pipelines/deployment/common/serve.py b/ml_pipelines/deployment/common/serve.py
index 01f9fb2..9f63904 100644
--- a/ml_pipelines/deployment/common/serve.py
+++ b/ml_pipelines/deployment/common/serve.py
@@ -3,7 +3,7 @@
 
 import uvicorn
 
-from ml_pipelines.logic.serve.serve import PredictionLoggingFunc, create_fastapi_app
+from ml_pipelines.core.serve.serve import PredictionLoggingFunc, create_fastapi_app
 from ml_pipelines.pipeline.train_pipeline import TrainArtifacts
 
 
diff --git a/ml_pipelines/deployment/local/io.py b/ml_pipelines/deployment/local/io.py
index 840221d..b60a07c 100644
--- a/ml_pipelines/deployment/local/io.py
+++ b/ml_pipelines/deployment/local/io.py
@@ -8,8 +8,8 @@
 from matplotlib.figure import Figure
 from sklearn.linear_model import LogisticRegression
 
-from ml_pipelines.logic.common.feature_eng import FeatureEngineeringParams
-from ml_pipelines.logic.serve.serve import Point
+from ml_pipelines.core.common.feature_eng import FeatureEngineeringParams
+from ml_pipelines.core.serve.serve import Point
 from ml_pipelines.pipeline.train_pipeline import TrainArtifacts
 
 
diff --git a/ml_pipelines/pipeline/data_gen_pipeline.py b/ml_pipelines/pipeline/data_gen_pipeline.py
index 8a3cf45..220c1c5 100644
--- a/ml_pipelines/pipeline/data_gen_pipeline.py
+++ b/ml_pipelines/pipeline/data_gen_pipeline.py
@@ -1,4 +1,4 @@
-from ml_pipelines.logic.common.dgp import generate_raw_data
+from ml_pipelines.core.common.dgp import generate_raw_data
 
 data = generate_raw_data(10_000, 813)
 data.to_csv("raw_data.csv", index=False)
diff --git a/ml_pipelines/pipeline/eval_pipeline.py b/ml_pipelines/pipeline/eval_pipeline.py
index 8028868..7d323db 100644
--- a/ml_pipelines/pipeline/eval_pipeline.py
+++ b/ml_pipelines/pipeline/eval_pipeline.py
@@ -4,12 +4,12 @@
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 
-from ml_pipelines.logic.common.feature_eng import (
+from ml_pipelines.core.common.feature_eng import (
     FeatureEngineeringParams,
     transform_features,
 )
-from ml_pipelines.logic.common.model import predict
-from ml_pipelines.logic.eval.eval import (
+from ml_pipelines.core.common.model import predict
+from ml_pipelines.core.eval.eval import (
     calculate_metrics,
     make_calibration_plot,
     make_roc_plot,
diff --git a/ml_pipelines/pipeline/train_pipeline.py b/ml_pipelines/pipeline/train_pipeline.py
index cdbe531..f7c9d72 100644
--- a/ml_pipelines/pipeline/train_pipeline.py
+++ b/ml_pipelines/pipeline/train_pipeline.py
@@ -4,12 +4,12 @@
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 
-from ml_pipelines.logic.common.feature_eng import (
+from ml_pipelines.core.common.feature_eng import (
     FeatureEngineeringParams,
     fit_feature_transform,
     transform_features,
 )
-from ml_pipelines.logic.train.train import split_data, train_model
+from ml_pipelines.core.train.train import split_data, train_model
 
 
 class TrainArtifacts(TypedDict):
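
For downstream code that imports from this package, the net effect of PATCH 3/3 is a mechanical path change from `ml_pipelines.logic` to `ml_pipelines.core`. A minimal before/after sketch, using module paths taken from the hunks above (combining the two serve imports on one line is illustrative):

```python
# Before the rename, the pure logic lived under `ml_pipelines.logic`:
# from ml_pipelines.logic.common.model import predict
# from ml_pipelines.logic.serve.serve import Point, create_fastapi_app

# After the rename, the same modules live under `ml_pipelines.core`:
from ml_pipelines.core.common.model import predict
from ml_pipelines.core.serve.serve import Point, create_fastapi_app
```

Because train, eval, and serve share one repo and one package (as the README above argues), the rename lands in a single patch that touches all three entry points at once.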