Skip to content

Commit

Permalink
Feature/dockerfile (#10)
Browse files Browse the repository at this point in the history
* add stuff

* add more stuff

* update eval, deps

* update readme
  • Loading branch information
azuur authored Jan 25, 2024
1 parent 50e41da commit 26a50f7
Show file tree
Hide file tree
Showing 11 changed files with 883 additions and 100 deletions.
163 changes: 163 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# IDE stuff
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
RAW_DATA_ROOT_PATH=/path/to/dir
TRAIN_ARTIFACTS_ROOT_PATH=/path/to/dir
49 changes: 49 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# syntax=docker/dockerfile:1

########################################################################
# Stage 1: build — install Poetry and build the project wheel.
# Everything here (compiler toolchain, git, Poetry itself) is discarded;
# only the wheel under /src/dist reaches the final image.
########################################################################
FROM python:3.11-slim AS build

ENV PYTHONUNBUFFERED=1 \
    # pip
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    # poetry
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false

# PATH must be set in a *second* ENV instruction: keys defined inside one
# ENV line are not visible to other keys of the same line, so the original
# single-instruction form expanded $POETRY_HOME to empty (forcing the
# inline `PATH=... poetry build` workaround below, now unnecessary).
ENV PATH="$POETRY_HOME/bin:$PATH"

# update + install in one layer (avoids the stale apt-cache pitfall),
# skip recommended packages, and drop the package lists in the same layer
# so they never persist in an image layer. Unused build-time tools from
# the original (nano, gnupg, apt-transport-https) are dropped.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# pipefail so a failed download cannot be masked by the pipe's exit code.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN curl -sSL https://install.python-poetry.org | python3 -

# Build in a dedicated directory instead of the image root.
WORKDIR /src

# Manifests before sources: the (rarely changing) files come first so the
# source-code layer is the only one invalidated by day-to-day edits.
COPY poetry.lock pyproject.toml README.md ./
COPY ml_pipelines ./ml_pipelines
RUN poetry build

########################################################################
# Stage 2: production — minimal runtime image, non-root user.
########################################################################
FROM python:3.11-slim

ARG UID=1000
ARG GID=1000
WORKDIR /package

RUN groupadd -g "${GID}" python \
    && useradd --create-home --no-log-init -u "${UID}" -g "${GID}" python \
    && chown python:python -R /package

# Wheel was built under WORKDIR /src in the build stage.
COPY --from=build --chown=python:python /src/dist/*.whl /package/dist/
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir /package/dist/*.whl

USER python
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,22 @@ Simple ML pipeline repo for experimenting with CI/CD / DevOps / MLOps.
~~- add loggers to stuff~~
~~- add local deployment code...~~
~~- add versioning to training... in deployment?~~
- add eval pipeline, model comparison
- add "best model" mark. add "get_best_model"
~~- add eval pipeline, model comparison~~
~~- add "best model" mark. add "get_best_model"~~
~~- add Dockerfile~~
- add real prediction logging func
- add db conn / func to save inference cases (local deployment)
- add Dockerfile
- add build script to push to ECR (AWS deployment)
- add rest of AWS deployment (using S3, EC2, AWS CodePipeline)

# Commands to remember
- python ml_pipelines/deployment/local/train.py
- python ml_pipelines/deployment/local/eval.py
- python ml_pipelines/deployment/local/serve.py
- python -m ml_pipelines train_local
- python -m ml_pipelines eval_local
- python -m ml_pipelines serve_local
- sudo docker build -t ml_pipelines:latest .
- sudo docker run --rm -it ml_pipelines:latest /bin/sh
- sudo docker run --rm -it -v localpath:containerpath --env-file .env ml_pipelines:latest /bin/bash -c "python -m ml_pipelines serve_local"
- sudo docker run --rm -it -v localpath:containerpath --env-file .env --network host ml_pipelines:latest /bin/bash -c "python -m ml_pipelines serve_local"
13 changes: 13 additions & 0 deletions ml_pipelines/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import typer

from ml_pipelines.deployment.local.eval import main as eval_local
from ml_pipelines.deployment.local.serve import main as serve_local
from ml_pipelines.deployment.local.train import main as train_local

# Single Typer application exposing each local pipeline as a subcommand,
# e.g. `python -m ml_pipelines train_local`.
app = typer.Typer()

app.command("train_local")(train_local)
app.command("eval_local")(eval_local)
app.command("serve_local")(serve_local)

if __name__ == "__main__":
    # Guard fixes the original's module-level `app()` call: importing
    # ml_pipelines.__main__ (tests, tooling, docs builds) no longer
    # executes the CLI, while `python -m ml_pipelines` behaves the same.
    app()
4 changes: 4 additions & 0 deletions ml_pipelines/deployment/local/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,17 @@ def get_latest_version(root_path: os.PathLike, filename: str) -> str:
root_dir = Path(root_path)
versions: list[tuple[str, float]] = []
for version_dir in root_dir.iterdir():
if not version_dir.is_dir():
continue
st_mtime = (version_dir / filename).stat().st_mtime
versions.append((version_dir.stem, st_mtime))
return max(versions, key=lambda t: t[1])[0]


def get_best_version(train_artifacts_root_path: os.PathLike):
train_dir = Path(train_artifacts_root_path)
if "best_model" not in set(f for f in train_dir.iterdir() if f.is_file()):
return None
with open(train_dir / "best_model") as f:
return f.read()

Expand Down
94 changes: 59 additions & 35 deletions ml_pipelines/deployment/local/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def run_eval_comparison_pipeline( # noqa: PLR0913
raw_data_root_path: os.PathLike,
train_versions: list[str],
train_artifacts_root_path: os.PathLike,
tag_best_model: bool,
logger: Logger,
):
logger.info(f"Running eval pipeline on model versions: {train_versions}.")
Expand All @@ -39,43 +40,66 @@ def run_eval_comparison_pipeline( # noqa: PLR0913
)
all_metrics.append((v, metrics))
best_version = max(all_metrics, key=lambda t: t[1])[0]
logger.info(f"Tagging best version as {best_version}")
tag_best_version(best_version, train_artifacts_root_path)
if tag_best_model and len(train_versions) > 1:
logger.info(f"Tagging best version as {best_version}")
tag_best_version(best_version, train_artifacts_root_path)


if __name__ == "__main__":
from dotenv import load_dotenv

load_dotenv()
RAW_DATA_ROOT_DIR = os.environ["RAW_DATA_ROOT_DIR"]
TRAIN_ARTIFACTS_ROOT_DIR = os.environ["TRAIN_ARTIFACTS_ROOT_DIR"]

def main(
raw_data_version: Union[str, None] = None, # noqa: UP007
train_versions: Union[list[str], None] = None, # noqa: UP007
raw_data_root_path: str = RAW_DATA_ROOT_DIR,
train_artifacts_root_path: str = TRAIN_ARTIFACTS_ROOT_DIR,
):
logger = Logger(__file__)
logger.addHandler(logging.StreamHandler(sys.stdout))

if raw_data_version is None:
raw_data_version = get_latest_version(
raw_data_root_path, # type: ignore
"raw_data.csv",
)

if not train_versions:
train_versions = get_all_available_train_versions( # type: ignore
train_artifacts_root_path
)

run_eval_comparison_pipeline( # noqa: PLR0913
raw_data_version=raw_data_version,
raw_data_root_path=raw_data_root_path, # type: ignore
train_versions=train_versions, # type: ignore
train_artifacts_root_path=train_artifacts_root_path, # type: ignore
logger=logger,
def main(
    raw_data_root_path: Union[str, None] = None,  # noqa: UP007
    train_artifacts_root_path: Union[str, None] = None,  # noqa: UP007, E501
    raw_data_version: Union[str, None] = None,  # noqa: UP007
    train_versions: Union[list[str], None] = None,  # noqa: UP007
    tag_best_model: bool = False,
):
    """
    Runs the model evaluation and comparison pipeline using local paths
    for inputs and outputs.
    If `raw_data_root_path` is null, the command searches for the RAW_DATA_ROOT_PATH
    environment variable, and if not present, assumes this to be "/".
    If `train_artifacts_root_path` is null, the command searches for the
    TRAIN_ARTIFACTS_ROOT_PATH environment variable, and if not present,
    assumes this to be "/".
    If `raw_data_version` is null, the command searches for the latest version in
    `raw_data_root_path`.
    If `train_versions` is null or empty, the command automatically evaluates all
    models found in `train_artifacts_root_path`.
    If `tag_best_model` is set (to true) and more than one model version is evaluated,
    the best performing one is tagged as the best version.
    """
    # Log to stdout so the pipeline is observable when run in a container.
    logger = Logger(__file__)
    logger.addHandler(logging.StreamHandler(sys.stdout))

    # CLI argument > environment variable > "/" fallback, for both roots.
    if raw_data_root_path is None:
        raw_data_root_path = os.environ.get("RAW_DATA_ROOT_PATH", "/")
    if train_artifacts_root_path is None:
        train_artifacts_root_path = os.environ.get("TRAIN_ARTIFACTS_ROOT_PATH", "/")

    # Default to the most recently modified raw-data version on disk.
    if raw_data_version is None:
        raw_data_version = get_latest_version(
            raw_data_root_path,  # type: ignore
            "raw_data.csv",
        )

    # Empty list and None are both treated as "evaluate every model".
    if not train_versions:
        train_versions = get_all_available_train_versions(  # type: ignore
            train_artifacts_root_path  # type: ignore
        )

    run_eval_comparison_pipeline(  # noqa: PLR0913
        raw_data_version=raw_data_version,
        raw_data_root_path=raw_data_root_path,  # type: ignore
        train_versions=train_versions,  # type: ignore
        train_artifacts_root_path=train_artifacts_root_path,  # type: ignore
        tag_best_model=tag_best_model,
        logger=logger,
    )


if __name__ == "__main__":
    # Expose `main` as a standalone Typer CLI: `python .../eval.py [OPTIONS]`.
    typer.run(main)
Loading

0 comments on commit 26a50f7

Please sign in to comment.