diff --git a/.gitignore b/.gitignore index 69de92e5..74712a80 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,17 @@ -# folders +# Environments +.venv -## IDEs +# IDEs .idea +.vscode -## Python -venv* -__pycache__ -.ipynb_checkpoints - -## Project -data +# Mac OS +.DS_Store -# files +# Python +__pycache__ -## Mac OS -.DS_Store +# Project +data/* +models/* +reports/* diff --git a/README.md b/README.md index cff8466b..e9f6e7a8 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,17 @@ -# Tutorial: dvc-3-automate-experiments +# Tutorial: Automate DVC experiments -## 1. clone this repository +## 1. Create and activate virtual environment -```bash -git clone https://gitlab.com/7labs.ru/tutorials-dvc/dvc-3-automate-experiments.git -cd dvc-3-automate-experiments -``` - -## 2. Create and activate virtual environment - -Install virtualenv in advance: +Create virtual environment ```bash -pip install virtualenv +python3 -m venv .venv +echo "export PYTHONPATH=$PWD" >> .venv/bin/activate +source .venv/bin/activate ``` -Create virtual environment -```bash -virtualenv venv-dvc-3-automate-experiments -source venv-dvc-3-automate-experiments/bin/activate -``` - -## 3. Install python libraries (including dvc) +## 2. Install python libraries ```bash pip install -r requirements.txt ``` - - -## 4. Add Virtual Environment to Jupyter Notebook - -```bash -python -m ipykernel install --user --name=venv-dvc-3-automate-experiments -``` - -## 5. Run and follow Jupyter Notebook `dvc-3-automate-experiments.ipynb` for instructions: - -```bash -jupyter notebook -``` - diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 00000000..b722e9e1 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb deleted file mode 100644 index 74371ddd..00000000 --- a/dvc-3-automate-experiments.ipynb +++ /dev/null @@ -1,2972 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Install and init DVC\n", - "\n", - "Prerequisites: \n", - "- DVC and requirements.txt packages installed (if not - check README.md file for instructions)\n", - "- A project repository is a Git repo \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install with pip" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:18.843826Z", - "start_time": "2020-07-01T07:32:16.105734Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting dvc==1.5.0\n", - " Using cached dvc-1.5.0-py2.py3-none-any.whl (445 kB)\n", - "Collecting ruamel.yaml>=0.16.1\n", - " Using cached ruamel.yaml-0.16.10-py2.py3-none-any.whl (111 kB)\n", - "Collecting shortuuid>=0.5.0\n", - " Using cached shortuuid-1.0.1-py3-none-any.whl (7.5 kB)\n", - "Collecting shtab<2,>=1.3.0\n", - " Using cached shtab-1.3.1-py2.py3-none-any.whl (12 kB)\n", - "Collecting pydot>=1.2.4\n", - " Using cached pydot-1.4.1-py2.py3-none-any.whl (19 kB)\n", - "Collecting rich>=3.0.5\n", - " Using cached rich-5.2.0-py3-none-any.whl (145 kB)\n", - "Collecting tabulate>=0.8.7\n", - " Using cached tabulate-0.8.7-py3-none-any.whl (24 kB)\n", - "Processing /home/alex/.cache/pip/wheels/3c/33/97/805b282e129f60bb4e87cea622338f30b65f21eaf65219971f/funcy-1.14-py2.py3-none-any.whl\n", - "Processing /home/alex/.cache/pip/wheels/49/68/a0/8e7cb7bbf4990fc10b5a082aa0eb3ac66787ca11e8eca445b2/flufl.lock-3.2-py3-none-any.whl\n", - "Collecting pyasn1>=0.4.1\n", - " Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n", - "Collecting appdirs>=1.4.3\n", - " Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n", - "Requirement already satisfied: setuptools>=34.0.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (47.1.1)\n", - "Collecting tqdm<5,>=4.45.0\n", - " Using cached tqdm-4.48.2-py2.py3-none-any.whl (68 kB)\n", - "Processing /home/alex/.cache/pip/wheels/bc/f8/ae/bc69cb5f61393ebf9ade4cde41d1a813d35bfe78263a26f99e/dpath-2.0.1-py3-none-any.whl\n", - "Collecting grandalf==0.6\n", - " Using cached grandalf-0.6-py3-none-any.whl (31 kB)\n", - "Processing /home/alex/.cache/pip/wheels/b8/92/aa/456d462c908b4e210c3928f778d28f94049fc9e47af8b191c9/nanotime-0.5.2-py3-none-any.whl\n", - "Collecting flatten-json<0.1.8,>=0.1.6\n", - " Using cached flatten_json-0.1.7-py3-none-any.whl (6.4 kB)\n", - "Processing /home/alex/.cache/pip/wheels/ce/22/5c/bcd55db68399954d13c8d3b23192a517dd59ba3ee8648fa773/pygtrie-2.3.2-py3-none-any.whl\n", - "Requirement already satisfied: packaging>=19.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (20.4)\n", - "Requirement already satisfied: PyYAML<5.4,>=5.1.2 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (5.3)\n", - "Processing /home/alex/.cache/pip/wheels/17/a2/0a/00fa5a0d6f271c82fc59be9ae47173bb6e6a462d4361224072/jsonpath_ng-1.5.1-py3-none-any.whl\n", - "Collecting colorama>=0.3.9\n", - " Using cached colorama-0.4.3-py2.py3-none-any.whl (15 kB)\n", - "Collecting toml>=0.10.1\n", - " Using cached toml-0.10.1-py2.py3-none-any.whl (19 kB)\n", - "Collecting pathspec>=0.6.0\n", - " Using cached pathspec-0.8.0-py2.py3-none-any.whl (28 kB)\n", - "Collecting gitpython>3\n", - " Using cached GitPython-3.1.7-py3-none-any.whl (158 kB)\n", - "Collecting networkx<2.5,>=2.1\n", - " Using cached networkx-2.4-py3-none-any.whl (1.6 MB)\n", - "Collecting ply>=3.9\n", - " Using cached ply-3.11-py2.py3-none-any.whl (49 kB)\n", - "Processing /home/alex/.cache/pip/wheels/0d/c4/19/13d74440f2a571841db6b6e0a273694327498884dafb9cf978/configobj-5.0.6-py3-none-any.whl\n", - "Collecting distro>=1.3.0\n", - " Using cached distro-1.5.0-py2.py3-none-any.whl (18 kB)\n", - "Collecting requests>=2.22.0\n", - " Using cached requests-2.24.0-py2.py3-none-any.whl (61 kB)\n", - "Processing /home/alex/.cache/pip/wheels/af/ee/20/047a79ba5ff692baa2f7e2e95c0cd57061a1673d59f5acf0d5/voluptuous-0.11.7-py3-none-any.whl\n", - "Collecting zc.lockfile>=1.2.1\n", - " Using cached zc.lockfile-2.0-py2.py3-none-any.whl (9.7 kB)\n", - "Collecting ruamel.yaml.clib>=0.1.2; platform_python_implementation == \"CPython\" and python_version < \"3.9\"\n", - " Using cached ruamel.yaml.clib-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (547 kB)\n", - "Requirement already satisfied: pyparsing>=2.1.4 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from pydot>=1.2.4->dvc==1.5.0) (2.4.7)\n", - "Collecting typing-extensions<4.0.0,>=3.7.4\n", - " Using cached typing_extensions-3.7.4.2-py3-none-any.whl (22 kB)\n", - "Collecting commonmark<0.10.0,>=0.9.0\n", - " Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from rich>=3.0.5->dvc==1.5.0) (2.6.1)\n", - "Processing /home/alex/.cache/pip/wheels/3e/5d/46/fa3cbde0ab8c53dbdd14658b3a4c97035b8851369ce8e79649/atpublic-2.0-py3-none-any.whl\n", - "Processing /home/alex/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e/future-0.18.2-cp37-none-any.whl\n", - "Requirement already satisfied: six in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from packaging>=19.0->dvc==1.5.0) (1.15.0)\n", - "Requirement already satisfied: decorator in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from jsonpath-ng>=1.5.1->dvc==1.5.0) (4.4.2)\n", - "Collecting gitdb<5,>=4.0.1\n", - " Using cached gitdb-4.0.5-py3-none-any.whl (63 kB)\n", - "Collecting certifi>=2017.4.17\n", - " Using cached certifi-2020.6.20-py2.py3-none-any.whl (156 kB)\n", - "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", - " Using cached urllib3-1.25.10-py2.py3-none-any.whl (127 kB)\n", - "Collecting chardet<4,>=3.0.2\n", - " Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)\n", - "Collecting idna<3,>=2.5\n", - " Using cached idna-2.10-py2.py3-none-any.whl (58 kB)\n", - "Collecting smmap<4,>=3.0.1\n", - " Using cached smmap-3.0.4-py2.py3-none-any.whl (25 kB)\n", - "Installing collected packages: ruamel.yaml.clib, ruamel.yaml, shortuuid, shtab, pydot, colorama, typing-extensions, commonmark, rich, tabulate, funcy, atpublic, flufl.lock, pyasn1, appdirs, tqdm, dpath, future, grandalf, nanotime, flatten-json, pygtrie, ply, jsonpath-ng, toml, pathspec, smmap, gitdb, gitpython, networkx, configobj, distro, certifi, urllib3, chardet, idna, requests, voluptuous, zc.lockfile, dvc\n", - " Attempting uninstall: tqdm\n", - " Found existing installation: tqdm 4.42.0\n", - " Uninstalling tqdm-4.42.0:\n", - " Successfully uninstalled tqdm-4.42.0\n", - "Successfully installed appdirs-1.4.4 atpublic-2.0 certifi-2020.6.20 chardet-3.0.4 colorama-0.4.3 commonmark-0.9.1 configobj-5.0.6 distro-1.5.0 dpath-2.0.1 dvc-1.5.0 flatten-json-0.1.7 flufl.lock-3.2 funcy-1.14 future-0.18.2 gitdb-4.0.5 gitpython-3.1.7 grandalf-0.6 idna-2.10 jsonpath-ng-1.5.1 nanotime-0.5.2 networkx-2.4 pathspec-0.8.0 ply-3.11 pyasn1-0.4.8 pydot-1.4.1 pygtrie-2.3.2 requests-2.24.0 rich-5.2.0 ruamel.yaml-0.16.10 ruamel.yaml.clib-0.2.0 shortuuid-1.0.1 shtab-1.3.1 smmap-3.0.4 tabulate-0.8.7 toml-0.10.1 tqdm-4.48.2 typing-extensions-3.7.4.2 urllib3-1.25.10 voluptuous-0.11.7 zc.lockfile-2.0\n", - "\u001b[33mWARNING: You are using pip version 20.1.1; however, version 20.2.2 is available.\n", - "You should consider upgrading via the '/home/alex/Dev/Projects/tutorials/tutorials-dvc/dvc-3-automate-experiments/venv-dvc-3-automate-experiments/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install dvc==1.5.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Checkout branch `tutorial`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:19.401395Z", - "start_time": "2020-07-01T07:32:19.271265Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'dvc-tutorial'\r\n" - ] - } - ], - "source": [ - "!git checkout -b dvc-tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2019-06-08T11:18:29.199273Z", - "start_time": "2019-06-08T11:18:29.196865Z" - } - }, - "source": [ - "## Initialize DVC\n", - "\n", - "References: \n", - "- https://dvc.org/doc/get-started/initialize " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:22.463407Z", - "start_time": "2020-07-01T07:32:21.450728Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "You can now commit the changes to git.\n", - "\n", - "\u001b[31m+---------------------------------------------------------------------+\n", - "\u001b[39m\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m DVC has enabled anonymous aggregate usage analytics. \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m Read the analytics documentation (and how to opt-out) here: \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m \u001b[34mhttps://dvc.org/doc/user-guide/analytics\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m+---------------------------------------------------------------------+\n", - "\u001b[39m\n", - "\u001b[33mWhat's next?\u001b[39m\n", - "\u001b[33m------------\u001b[39m\n", - "- Check out the documentation: \u001b[34mhttps://dvc.org/doc\u001b[39m\n", - "- Get help and share ideas: \u001b[34mhttps://dvc.org/chat\u001b[39m\n", - "- Star us on GitHub: \u001b[34mhttps://github.com/iterative/dvc\u001b[39m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc init" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Commit changes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:26.446894Z", - "start_time": "2020-07-01T07:32:26.392814Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dvc-tutorial f285905] Initialize DVC\n", - " 6 files changed, 128 insertions(+)\n", - " create mode 100644 .dvc/.gitignore\n", - " create mode 100644 .dvc/config\n", - " create mode 100644 .dvc/plots/confusion.json\n", - " create mode 100644 .dvc/plots/default.json\n", - " create mode 100644 .dvc/plots/scatter.json\n", - " create mode 100644 .dvc/plots/smooth.json\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "git add .\n", - "git commit -m \"Initialize DVC\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build automated pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create `data_load` stage\n" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:35.023136Z", - "start_time": "2020-07-03T19:30:34.904974Z" - } - }, - "outputs": [], - "source": [ - "!mkdir -p data" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.406056Z", - "start_time": "2020-07-03T19:30:35.351794Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'data_load' from run-cache \n", - "Skipping run, checking out outputs\n", - "Creating 'dvc.yaml'\n", - "Adding stage 'data_load' in 'dvc.yaml'\n", - "Generating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml .dvc/.gitignore\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc run -n data_load \\\n", - " -d src/data_load.py \\\n", - " -o data/iris.csv \\\n", - " -o data/classes.json \\\n", - " -p data_load \\\n", - " python src/data_load.py \\\n", - " --config=params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.455211Z", - "start_time": "2020-07-03T19:30:37.433214Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4.0K\tdata/classes.json\n", - "4.0K\tdata/cm.csv\n", - "4.0K\tdata/iris.csv\n", - "8.0K\tdata/iris_featurized.csv\n", - "4.0K\tdata/metrics.json\n", - "8.0K\tdata/model.joblib\n", - "4.0K\tdata/test.csv\n", - "8.0K\tdata/train.csv\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "du -sh data/*" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.604922Z", - "start_time": "2020-07-03T19:30:37.479654Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[01;34m.\u001b[00m\r\n", - "├── README.md\r\n", - "├── \u001b[01;34mdata\u001b[00m\r\n", - "│   ├── classes.json\r\n", - "│   ├── cm.csv\r\n", - "│   ├── iris.csv\r\n", - "│   ├── iris_featurized.csv\r\n", - "│   ├── metrics.json\r\n", - "│   ├── model.joblib\r\n", - "│   ├── test.csv\r\n", - "│   └── train.csv\r\n", - "├── dvc-3-automate-experiments.ipynb\r\n", - "├── dvc.lock\r\n", - "├── dvc.yaml\r\n", - "├── params.yaml\r\n", - "├── requirements.txt\r\n", - "└── \u001b[01;34msrc\u001b[00m\r\n", - " ├── __init__.py\r\n", - " ├── data_load.py\r\n", - " ├── evaluate.py\r\n", - " ├── featurization.py\r\n", - " ├── split_dataset.py\r\n", - " └── train.py\r\n", - "\r\n", - "2 directories, 20 files\r\n" - ] - } - ], - "source": [ - "!tree -I venv-dvc-3-automate-experiments" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## dvc.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.727096Z", - "start_time": "2020-07-03T19:30:37.609182Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.877998Z", - "start_time": "2020-07-03T19:30:37.755666Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "data_load:\r\n", - " raw_data_path: data/iris.csv\r\n", - " classes_names_path: data/classes.json\r\n", - "\r\n", - "featurize:\r\n", - " features_path: data/iris_featurized.csv\r\n", - " target_column: target\r\n", - "\r\n", - "\r\n", - "data_split:\r\n", - " test_size: 0.2\r\n", - " train_path: data/train.csv\r\n", - " test_path: data/test.csv\r\n", - "\r\n", - "\r\n", - "train:\r\n", - " model_path: data/model.joblib\r\n", - "\r\n", - "\r\n", - "evaluate:\r\n", - " metrics_file: data/metrics.json\r\n", - " confusion_matrix: data/cm.csv\r\n" - ] - } - ], - "source": [ - "!cat params.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reproduce a pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:39.781553Z", - "start_time": "2020-07-03T19:30:37.923002Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Change params.yaml and reproduce \n", - "\n", - "Add a new line into `data_load` section:\n", - " `dummy_param: dummy_value`" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:41.698409Z", - "start_time": "2020-07-03T19:30:39.807607Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build end-to-end Machine Learning pipeline\n", - "Stages \n", - "- extract features \n", - "- split dataset \n", - "- train \n", - "- evaluate \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add feature extraction stage" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.387596Z", - "start_time": "2020-07-03T19:30:43.388868Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'feature_extraction' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'feature_extraction' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc run -n feature_extraction \\\n", - " -d src/featurization.py \\\n", - " -d data/iris.csv \\\n", - " -o data/iris_featurized.csv \\\n", - " -p data_load,featurize \\\n", - " python src/featurization.py \\\n", - " --config=params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.561869Z", - "start_time": "2020-07-03T19:30:45.439521Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "README.md params.yaml\r\n", - "\u001b[1m\u001b[36mdata\u001b[m\u001b[m requirements.txt\r\n", - "dvc-3-automate-experiments.ipynb \u001b[1m\u001b[36msrc\u001b[m\u001b[m\r\n", - "dvc.lock \u001b[1m\u001b[36mvenv-dvc-3-automate-experiments\u001b[m\u001b[m\r\n", - "dvc.yaml\r\n" - ] - } - ], - "source": [ - "!ls " - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.706627Z", - "start_time": "2020-07-03T19:30:45.585641Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.745702Z", - "start_time": "2020-07-03T19:30:45.734321Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sepal_lengthsepal_widthpetal_lengthpetal_widthtarget
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", - "
" - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width target\n", - "0 5.1 3.5 1.4 0.2 0\n", - "1 4.9 3.0 1.4 0.2 0\n", - "2 4.7 3.2 1.3 0.2 0\n", - "3 4.6 3.1 1.5 0.2 0\n", - "4 5.0 3.6 1.4 0.2 0" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "features = pd.read_csv('data/iris_featurized.csv')\n", - "features.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.893549Z", - "start_time": "2020-07-03T19:30:45.763986Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31m??\u001b[m .dvc/\r\n", - "\u001b[31m??\u001b[m dvc.lock\r\n", - "\u001b[31m??\u001b[m dvc.yaml\r\n" - ] - } - ], - "source": [ - "!git status -s" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.961182Z", - "start_time": "2020-07-03T19:30:45.916816Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev 0ae7569] Add stage features_extraction\n", - " 3 files changed, 56 insertions(+)\n", - " create mode 100644 .dvc/.gitignore\n", - " create mode 100644 dvc.lock\n", - " create mode 100644 dvc.yaml\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage features_extraction\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add split train/test stage" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.044867Z", - "start_time": "2020-07-03T19:30:45.984594Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'split_dataset' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'split_dataset' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc run -n split_dataset \\\n", - " -d src/split_dataset.py \\\n", - " -d data/iris_featurized.csv \\\n", - " -o data/train.csv \\\n", - " -o data/test.csv \\\n", - " -p featurize,data_split \\\n", - " python src/split_dataset.py \\\n", - " --config=params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.186864Z", - "start_time": "2020-07-03T19:30:48.068177Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.250249Z", - "start_time": "2020-07-03T19:30:48.209429Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev e39a9d3] Add stage split_dataset\n", - " 2 files changed, 32 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage split_dataset\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add train stage" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.298161Z", - "start_time": "2020-07-03T19:30:48.275068Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'train' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'train' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc run -n train \\\n", - " -d src/train.py \\\n", - " -d data/train.csv \\\n", - " -o data/model.joblib \\\n", - " -p data_split,train \\\n", - " python src/train.py \\\n", - " --config=params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.444828Z", - "start_time": "2020-07-03T19:30:50.324345Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n", - " train:\r\n", - " cmd: python src/train.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/train.csv\r\n", - " - src/train.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - train\r\n", - " outs:\r\n", - " - data/model.joblib\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.512656Z", - "start_time": "2020-07-03T19:30:50.468759Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev d084d1b] Add stage train\n", - " 2 files changed, 28 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage train\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add evaluate stage" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.746281Z", - "start_time": "2020-07-03T19:30:50.546074Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'evaluate' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'evaluate' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.yaml dvc.lock\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc run -n evaluate \\\n", - " -d src/evaluate.py \\\n", - " -d data/test.csv \\\n", - " -d data/model.joblib \\\n", - " -d data/classes.json \\\n", - " -m data/metrics.json \\\n", - " --plots data/cm.csv \\\n", - " -p data_load,data_split,train,evaluate \\\n", - " python src/evaluate.py \\\n", - " --config=params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.886914Z", - "start_time": "2020-07-03T19:30:52.769527Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n", - " train:\r\n", - " cmd: python src/train.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/train.csv\r\n", - " - src/train.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - train\r\n", - " outs:\r\n", - " - data/model.joblib\r\n", - " evaluate:\r\n", - " cmd: python src/evaluate.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/classes.json\r\n", - " - data/model.joblib\r\n", - " - data/test.csv\r", - "\r\n", - " - src/evaluate.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - data_split\r\n", - " - evaluate\r\n", - " - train\r\n", - " metrics:\r\n", - " - data/metrics.json\r\n", - " plots:\r\n", - " - data/cm.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.971253Z", - "start_time": "2020-07-03T19:30:52.919420Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev ecf5bc5] Add stage evaluate\n", - " 2 files changed, 46 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage evaluate\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-06-28T17:23:10.812463Z", - "start_time": "2020-06-28T17:23:09.886129Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Experimenting with reproducible pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How reproduce experiments?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> The most exciting part of DVC is reproducibility.\n", - ">> Reproducibility is the time you are getting benefits out of DVC instead of spending time defining the ML pipelines.\n", - "\n", - "> DVC tracks all the dependencies, which helps you iterate on ML models faster without thinking what was affected by your last change.\n", - ">> In order to track all the dependencies, DVC finds and reads ALL the DVC-files in a repository and builds a dependency graph (DAG) based on these files.\n", - "\n", - "> This is one of the differences between DVC reproducibility and traditional Makefile-like build automation tools (Make, Maven, Ant, Rakefile etc). It was designed in such a way to localize specification of DAG nodes.\n", - "If you run repro on any created DVC-file from our repository, nothing happens because nothing was changed in the defined pipeline.\n", - "\n", - "(c) dvc.org https://dvc.org/doc/tutorial/reproducibility" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:02.889684Z", - "start_time": "2020-07-03T19:31:00.936546Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Stage 'train' didn't change, skipping\n", - "Stage 'evaluate' didn't change, skipping\n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Nothing to reproduce\n", - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 1: Add features\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create new experiment branch\n", - "\n", - "Before editing the code/featurization.py file, please create and checkout a new branch __ratio_features__" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:05.089755Z", - "start_time": "2020-07-03T19:31:04.832150Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp1-ratio-features'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - "* \u001b[32mexp1-ratio-features\u001b[m\n", - " master\u001b[m\n" - ] - } - ], - "source": [ - "# create new branch\n", - "\n", - "!git checkout -b exp1-ratio-features\n", - "!git branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update featurization.py" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "in file __featurization.py__ in function`get_features()` after line \n", - "\n", - "```python\n", - " features = dataset.copy()\n", - "```\n", - "\n", - "add lines:\n", - "\n", - "```python\n", - " features['sepal_length_to_sepal_width'] = features['sepal_length'] / features['sepal_width']\n", - " features['petal_length_to_petal_width'] = features['petal_length'] / features['petal_width']\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reproduce pipeline " - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.674990Z", - "start_time": "2020-07-03T19:31:25.527004Z" - }, - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Running stage 'feature_extraction' with command:\n", - "\tpython src/featurization.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Restored stage 'split_dataset' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "Restored stage 'train' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.713726Z", - "start_time": "2020-07-03T19:31:28.699701Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sepal_lengthsepal_widthpetal_lengthpetal_widthtargetsepal_length_to_sepal_widthpetal_length_to_petal_width
05.13.51.40.201.4571437.0
14.93.01.40.201.6333337.0
24.73.21.30.201.4687506.5
34.63.11.50.201.4838717.5
45.03.61.40.201.3888897.0
\n", - "
" - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width target \\\n", - "0 5.1 3.5 1.4 0.2 0 \n", - "1 4.9 3.0 1.4 0.2 0 \n", - "2 4.7 3.2 1.3 0.2 0 \n", - "3 4.6 3.1 1.5 0.2 0 \n", - "4 5.0 3.6 1.4 0.2 0 \n", - "\n", - " sepal_length_to_sepal_width petal_length_to_petal_width \n", - "0 1.457143 7.0 \n", - "1 1.633333 7.0 \n", - "2 1.468750 6.5 \n", - "3 1.483871 7.5 \n", - "4 1.388889 7.0 " - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Check features used in this pipeline\n", - "\n", - "import pandas as pd\n", - "\n", - "features = pd.read_csv('data/iris_featurized.csv')\n", - "features.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.867945Z", - "start_time": "2020-07-03T19:31:28.737094Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "On branch exp1-ratio-features\r\n", - "Changes not staged for commit:\r\n", - " (use \"git add ...\" to update what will be committed)\r\n", - " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n", - "\t\u001b[31mmodified: src/featurization.py\u001b[m\r\n", - "\r\n", - "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" - ] - } - ], - "source": [ - "!git status" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:36.736663Z", - "start_time": "2020-07-03T19:31:35.023151Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 0.15385 0.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Get difference with metric from previous pipeline\n", - "!dvc metrics diff --all" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:39.838836Z", - "start_time": "2020-07-03T19:31:39.445353Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[exp1-ratio-features 1fc8ec3] Experiment with new features\n", - " 3 files changed, 872 insertions(+), 510 deletions(-)\n", - "fatal: tag 'exp1_ratio_features' already exists\n" - ] - } - ], - "source": [ - "!git add .\n", - "!git commit -m \"Experiment with new features\"\n", - "!git tag -a \"exp1_ratio_features\" -m \"Experiment with new features\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 2: Tune Logistic Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a new experiment branch" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:32:43.387938Z", - "start_time": "2020-07-03T19:32:43.131917Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp2-tuning-logreg'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - " exp1-ratio-features\u001b[m\n", - "* \u001b[32mexp2-tuning-logreg\u001b[m\n", - " master\u001b[m\n" - ] - } - ], - "source": [ - "# create new branch for experiment\n", - "\n", - "!git checkout -b exp2-tuning-logreg\n", - "!git branch" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:32:52.254763Z", - "start_time": "2020-07-03T19:32:50.225661Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Stage 'train' didn't change, skipping\n", - "Stage 'evaluate' didn't change, skipping\n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Nothing to reproduce since code was checked out by `git checkout`\n", - "# and data files were checked out by `dvc checkout`\n", - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tuning parameters\n", - "\n", - "in file __train.py__ :\n", - "\n", - "replace LogisticRegression params with:\n", - "\n", - "```python\n", - " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n", - "```\n", - "__Note__: here we changed logistic regresssion hyperparameters: C to 0.1\n", - "\n", - "\n", - "https://dvc.org/doc/tutorials/get-started/experiments#tuning-parameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reproduce pipelines" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:33:22.746410Z", - "start_time": "2020-07-03T19:33:19.314933Z" - }, - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Running stage 'train' with command:\n", - "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# re-run pipeline \n", - "\n", - "!dvc repro" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:33:24.945534Z", - "start_time": "2020-07-03T19:33:24.825464Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"f1_score\": 1.0}" - ] - } - ], - "source": [ - "# Get difference with metric from previous pipeline\n", - "!cat data/metrics.json" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:34:06.466000Z", - "start_time": "2020-07-03T19:34:05.328958Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:34:08.160934Z", - "start_time": "2020-07-03T19:34:06.494683Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.84615\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics diff --all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commit" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:15.808072Z", - "start_time": "2020-07-03T19:36:15.762972Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "On branch exp2-tuning-logreg\n", - "nothing to commit, working tree clean\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "git add .\n", - "git commit -m \"Tune model. LogisticRegression. C=0.1\"\n", - "git tag -a \"exp2_tuning_logreg\" -m \"Tune model. LogisticRegression. C=0.1\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 3: Use SVM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:14:01.831192Z", - "start_time": "2020-07-03T19:14:01.829062Z" - } - }, - "source": [ - "### Create a new experiment branch" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:20.443851Z", - "start_time": "2020-07-03T19:36:20.187021Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp3-svm'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - " exp1-ratio-features\u001b[m\n", - " exp2-tuning-logreg\u001b[m\n", - "* \u001b[32mexp3-svm\u001b[m\n", - " master\u001b[m\n" - ] - } - ], - "source": [ - "!git checkout -b exp3-svm\n", - "!git branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update train.py\n", - "\n", - "in file __train.py__ replace line\n", - "\n", - "```python\n", - " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n", - "```\n", - "\n", - "with line\n", - "\n", - "```python\n", - " clf = SVC(C=0.01, kernel='linear', gamma='scale', degree=5)\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reproduce pipeline " - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:35.537208Z", - "start_time": "2020-07-03T19:36:32.544097Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Running stage 'train' with command:\n", - "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:38.995561Z", - "start_time": "2020-07-03T19:36:37.831841Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:40.521084Z", - "start_time": "2020-07-03T19:36:40.392754Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "On branch exp3-svm\r\n", - "Changes not staged for commit:\r\n", - " (use \"git add ...\" to update what will be committed)\r\n", - " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n", - "\t\u001b[31mmodified: src/train.py\u001b[m\r\n", - "\r\n", - "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" - ] - } - ], - "source": [ - "!git status" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:41.766798Z", - "start_time": "2020-07-03T19:36:41.377185Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[exp3-svm 1474ec0] Experiment 3 with SVM estimator\r\n", - " 2 files changed, 5 insertions(+), 4 deletions(-)\r\n" - ] - } - ], - "source": [ - "!git add .\n", - "!git commit -m \"Experiment 3 with SVM estimator\"\n", - "!git tag -a \"exp3_svm\" -m \"Experiment 3 with SVM estimator\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Merge best experiment `dvc-tutorial ` branch" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:45:17.537969Z", - "start_time": "2020-07-03T19:45:17.463715Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Auto-merging src/train.py\n", - "CONFLICT (content): Merge conflict in src/train.py\n", - "Auto-merging src/featurization.py\n", - "CONFLICT (add/add): Merge conflict in dvc.lock\n", - "Auto-merging dvc.lock\n", - "Auto-merging dvc-3-automate-experiments.ipynb\n", - "CONFLICT (content): Merge conflict in dvc-3-automate-experiments.ipynb\n", - "Automatic merge failed; fix conflicts and then commit the result.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Previous HEAD position was 1474ec0 Experiment 3 with SVM estimator\n", - "Switched to branch 'dvc-tutorial'\n" - ] - }, - { - "ename": "CalledProcessError", - "evalue": "Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2352\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1." - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "git checkout dvc-tutorial \n", - "git merge exp3_svm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compare experiment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare params " - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:39:20.728429Z", - "start_time": "2020-07-03T19:39:19.065249Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m " - ] - } - ], - "source": [ - "# Get params diffs \n", - "\n", - "!dvc params diff" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:39:29.288964Z", - "start_time": "2020-07-03T19:39:27.598159Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Param Old New \n", - "params.yaml data_load.classes_names_path data/classes.json data/classes.json\n", - "params.yaml data_load.raw_data_path data/iris.csv data/iris.csv\n", - "params.yaml data_split.test_path data/test.csv data/test.csv\n", - "params.yaml data_split.test_size 0.2 0.2\n", - "params.yaml data_split.train_path data/train.csv data/train.csv\n", - "params.yaml evaluate.confusion_matrix data/cm.csv data/cm.csv\n", - "params.yaml evaluate.metrics_file data/metrics.json data/metrics.json\n", - "params.yaml featurize.features_path data/iris_featurized.csv data/iris_featurized.csv\n", - "params.yaml featurize.target_column target target\n", - "params.yaml train.model_path data/model.joblib data/model.joblib\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Compare parameters with a specific commit, a tag or any revision\n", - "\n", - "!dvc params diff --all" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:09:20.304575Z", - "start_time": "2020-07-03T19:09:18.649548Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"params.yaml\": {\"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}}}\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc params diff --show-json --all" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:09:27.495017Z", - "start_time": "2020-07-03T19:09:25.848748Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| Path | Param | Old | New |\n", - "|-------------|------------------------------|--------------------------|--------------------------|\n", - "| params.yaml | data_load.classes_names_path | data/classes.json | data/classes.json |\n", - "| params.yaml | data_load.raw_data_path | data/iris.csv | data/iris.csv |\n", - "| params.yaml | data_split.test_path | data/test.csv | data/test.csv |\n", - "| params.yaml | data_split.test_size | 0.2 | 0.2 |\n", - "| params.yaml | data_split.train_path | data/train.csv | data/train.csv |\n", - "| params.yaml | evaluate.confusion_matrix | data/cm.csv | data/cm.csv |\n", - "| params.yaml | evaluate.metrics_file | data/metrics.json | data/metrics.json |\n", - "| params.yaml | featurize.features_path | data/iris_featurized.csv | data/iris_featurized.csv |\n", - "| params.yaml | featurize.target_column | target | target |\n", - "| params.yaml | train.model_path | data/model.joblib | data/model.joblib |\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc params diff --show-md --all" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:00:44.847802Z", - "start_time": "2020-07-03T19:00:44.717758Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mcommit 736c92a6eeda6261f528d7a2e2d4db4cb306fa03\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mexp2-svm\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp2_svm\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Fri Jul 3 21:49:25 2020 +0300\r\n", - "\r\n", - " Experiment 2 with SVM estimator\r\n", - "\r\n", - "\u001b[33mcommit 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Fri Jul 3 21:48:42 2020 +0300\r\n", - "\r\n", - " Experiment with new features\r\n", - "\r\n", - "\u001b[33mcommit 34a0bc667f86c3b5e388bef672eb598b8a6a7788\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:35:03 2020 +0300\r\n", - "\r\n", - " Add stage evaluate\r\n", - "\r\n", - "\u001b[33mcommit 4c45a4ff702106d78bbaf8d356e0e95ca268e05b\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:34:09 2020 +0300\r\n", - "\r\n", - " Add stage train\r\n", - "\r\n", - "\u001b[33mcommit f41781d2c4855762c4405636491bc014cc00bd20\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:34:00 2020 +0300\r\n", - "\r\n", - " Add stage split_dataset\r\n", - "\r\n", - "\u001b[33mcommit dbfc854a931baf57ad116f811c2cea39d4fb69a9\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:33:51 2020 +0300\r\n", - "\r\n", - " Add stage features_extraction\r\n", - "\r\n", - "\u001b[33mcommit f2859056db4c53e11ba0593388fddd19018d577b\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:32:26 2020 +0300\r\n", - "\r\n", - " Initialize DVC\r\n", - "\r\n", - "\u001b[33mcommit 1102dc2e3f636b2d37558f95a960c788f3de32ed\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m, \u001b[m\u001b[1;32mdev\u001b[m\u001b[33m)\u001b[m\r\n", - "Merge: 855c61a 92ac211\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 07:22:32 2020 +0000\r\n", - "\r\n", - " Merge branch 'update-confusion-matrix' into 'dev'\r\n", - " \r\n", - " update confusion matrix\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!4\r\n", - "\r\n", - "\u001b[33mcommit 92ac211f2139095965d0e26304d2d39003136def\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Tue Jun 30 13:08:30 2020 +0900\r\n", - "\r\n", - " update confusion matrix\r\n", - "\r\n", - "\u001b[33mcommit 855c61ac3f02f8938445fe749846e20d01e0f247\u001b[m\r\n", - "Merge: 22aeb23 7fbf4d8\r\n", - "Author: Alexander Kolosov \r\n", - "Date: Mon Jun 29 08:47:37 2020 +0000\r\n", - "\r\n", - " Merge branch 'dev-update-pipelines' into 'dev'\r\n", - " \r\n", - " Dev update pipelines\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!3\r\n", - "\r\n", - "\u001b[33mcommit 7fbf4d8f4e54be947f77dce09191b4f6fbb287f0\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Mon Jun 29 08:47:37 2020 +0000\r\n", - "\r\n", - " Dev update pipelines\r\n", - "\r\n", - "\u001b[33mcommit 22aeb23eb6b54f12f11c76a5714dbf6bff5f11f9\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Sun Jun 28 19:02:29 2020 +0300\r\n", - "\r\n", - " Update name of tutorial and notebook\r\n", - "\r\n", - "\u001b[33mcommit 110a584e41fa7c140bbaf8130f70d4112e58d1a4\u001b[m\r\n", - "Merge: 2d7e834 a8d3200\r\n", - "Author: Mikhail \r\n", - "Date: Sat Jun 27 07:49:11 2020 +0000\r\n", - "\r\n", - " Merge branch 'update-software' into 'dev'\r\n", - " \r\n", - " Update software\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!2\r\n", - "\r\n", - "\u001b[33mcommit a8d3200b8cbffdc4af1c7204710d217e9f685928\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Fri Jun 26 17:58:32 2020 +0900\r\n", - "\r\n", - " intall toc for jupyter notebook\r\n", - "\r\n", - "\u001b[33mcommit 8b042ad196928f9584b4bbce058625896af78d9d\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Fri Jun 26 17:58:12 2020 +0900\r\n", - "\r\n", - " upgrade dvc\r\n", - "\r\n", - "\u001b[33mcommit 2d7e834a6d115d1b47253377b3baaace559e3259\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Thu Jun 11 12:53:18 2020 +0900\r\n", - "\r\n", - " add data/ to .gitignore\r\n", - "\r\n", - "\u001b[33mcommit 8817b3ed1f82ed1c4feb9122d49237b37356e70e\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:56:32 2020 +0900\r\n", - "\r\n", - " update Lesson 4.ipynb: append description of dvc plots diff\r\n", - "\r\n", - "\u001b[33mcommit a8db726c3f368c39180d61d21f21bf6727db20c0\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:44:00 2020 +0900\r\n", - "\r\n", - " update Lesson 4.ipynb: add section for dvc metrics diff and dvc plots\r\n", - "\r\n", - "\u001b[33mcommit 77559e316fe6b5fd0a11f27a06fbc9eed1c2b606\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:43:09 2020 +0900\r\n", - "\r\n", - " update src/evaluate.py: put metric and confusion matrix in separated files\r\n", - "\r\n", - "\u001b[33mcommit a0afac2ff2dc7c5815c72ec3770888b67e5f04e7\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 12:05:51 2020 +0900\r\n", - "\r\n", - " refactor code modules\r\n", - "\r\n", - "\u001b[33mcommit 73846297879b1f1be3868c64e73b7d8ad6966b09\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 12:04:37 2020 +0900\r\n", - "\r\n", - " fix Lesson 4.ipynb\r\n", - "\r\n", - "\u001b[33mcommit b6ba776f8607c6481e34f8a40af4c23a5cd36990\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Tue Jun 9 19:27:13 2020 +0900\r\n", - "\r\n", - " create repo structure for lesson 4\r\n" - ] - } - ], - "source": [ - "# To see the difference between two specific commits, both need to be specified:\n", - "\n", - "!git log" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:01:12.974894Z", - "start_time": "2020-07-03T19:01:11.320625Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m " - ] - } - ], - "source": [ - "\n", - "!dvc params diff 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c HEAD^" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Show metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:42:07.828077Z", - "start_time": "2020-07-03T19:42:06.658092Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# this pipeline metrics \n", - "\n", - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:42:10.492627Z", - "start_time": "2020-07-03T19:42:09.201160Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "workspace: \n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "dev:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "dvc-tutorial:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9305555555555555\n", - "exp1-ratio-features:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.15384615384615383\n", - "exp2-tuning-logreg:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "exp3-svm:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "exp2_tuning_logreg:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "exp3_svm:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# show all commited pipelines metrics (all branch and tags)\n", - "\n", - "!dvc metrics show -a -T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare metrics (get differences)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:43:27.774038Z", - "start_time": "2020-07-03T19:43:26.104962Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m " - ] - } - ], - "source": [ - "!dvc metrics diff" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:44:46.444858Z", - "start_time": "2020-07-03T19:44:44.738955Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# --all - list all metrics, even those without changes\n", - "\n", - "!dvc metrics diff --all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* чтобы сравнить текущую метрики из текущего коммита и из другого, нужно указать другой (old) коммит:" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:11:04.120125Z", - "start_time": "2020-07-03T19:11:02.460457Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.84615\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Compare old and new branches\n", - "\n", - "\n", - "!dvc metrics diff exp1-ratio-features exp2-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:10:59.357203Z", - "start_time": "2020-07-03T19:10:57.708759Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 0.93056 0.77671\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Equivalent to `!dvc metrics diff exp1-ratio-features dvc-tutorial`, because dvc-tutorial - current branch\n", - "\n", - "!dvc metrics diff exp1-ratio-features" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:50:29.269796Z", - "start_time": "2020-07-03T19:50:29.132897Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to branch 'dev'\r\n", - "Your branch is ahead of 'origin/dev' by 7 commits.\r\n", - " (use \"git push\" to publish your local commits)\r\n" - ] - } - ], - "source": [ - "!git checkout dev -f" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* чтобы выводить не только новую, но и старую метрики, нужно добавить опцию --old" - ] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:48:02.485718Z", - "start_time": "2020-07-03T19:48:01.562562Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR\u001b[39m: failed to show metrics diff - unable to read: 'dvc.lock', YAML file structure is corrupted: while scanning a simple key\n", - " in \"\", line 22, column 1\n", - "could not find expected ':'\n", - " in \"\", line 23, column 8\n", - "\n", - "\u001b[33mHaving any troubles?\u001b[39m Hit us up at \u001b[34mhttps://dvc.org/support\u001b[39m, we are always happy to help!\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Use --old to show both old and new metrics vlues \n", - "\n", - "!dvc metrics diff --old exp1-ratio-features exp2-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:50:33.253819Z", - "start_time": "2020-07-03T19:50:31.570404Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| Path | Metric | Value | Change | \n", - "|--------|----------|---------|----------|\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics diff --show-md" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Plots\n" - ] - }, - { - "cell_type": "code", - "execution_count": 165, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T20:08:18.872602Z", - "start_time": "2020-07-03T20:08:18.869605Z" - } - }, - "outputs": [], - "source": [ - "from IPython.display import IFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Show" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T20:10:21.387140Z", - "start_time": "2020-07-03T20:10:20.271263Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-show.html\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc plots show --template confusion \"data/cm.csv\" -x actual -y predicted -o data/plots-show.html" - ] - }, - { - "cell_type": "code", - "execution_count": 177, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T20:10:21.421474Z", - "start_time": "2020-07-03T20:10:21.416923Z" - }, - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 177, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "IFrame(src='data/plots-show.html', width=500, height=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Diff" - ] - }, - { - "cell_type": "code", - "execution_count": 192, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T20:27:04.674839Z", - "start_time": "2020-07-03T20:27:03.879598Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-diff.html\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Build metircs plots for all 3 experiments\n", - "!dvc plots diff -t confusion -o data/plots-diff.html exp1-ratio-features exp3-svm -x predicted" - ] - }, - { - "cell_type": "code", - "execution_count": 194, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T20:27:34.434387Z", - "start_time": "2020-07-03T20:27:34.430369Z" - }, - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 194, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "IFrame(src='data/plots-diff.html', width=1000, height=400)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "230.953px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 00000000..b722e9e1 --- /dev/null +++ b/models/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/params.yaml b/params.yaml index 9bb859cd..933640bd 100644 --- a/params.yaml +++ b/params.yaml @@ -1,8 +1,8 @@ - data_load: raw_data_path: data/iris.csv classes_names_path: data/classes.json + featurize: features_path: data/iris_featurized.csv target_column: target @@ -15,9 +15,9 @@ data_split: train: - model_path: data/model.joblib + model_path: models/model.joblib evaluate: - metrics_file: data/metrics.json - confusion_matrix: data/cm.csv + metrics_file: reports/metrics.json + confusion_matrix: reports/cm.csv diff --git a/reports/.gitignore b/reports/.gitignore new file mode 100644 index 00000000..b722e9e1 --- /dev/null +++ b/reports/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d470460a..79a1c05b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,10 @@ -joblib==0.15.1 -jupyter==1.0.0 -jupyter_contrib_nbextensions==0.5.1 -ipykernel==5.3.0 -matplotlib==3.1.2 -numpy==1.18.1 -pandas==1.0.0 -pyyaml==5.3 -scikit-learn==0.23.1 -scipy==1.4.1 -tqdm==4.42.0 \ No newline at end of file +dvc==2.57.2 +joblib==1.2.0 +matplotlib==3.7.1 +numpy==1.24.3 +pandas==2.0.1 +python-box==7.0.1 +pyyaml==6.0 +scikit-learn==1.2.2 +scipy==1.10.1 +tqdm==4.65.0 \ No newline at end of file diff --git a/src/data_load.py b/src/data_load.py index b07a8258..04005193 100644 --- a/src/data_load.py +++ b/src/data_load.py @@ -2,7 +2,8 @@ import json from sklearn.datasets import load_iris from typing import Text -import yaml + +from src.utils import load_config def data_load(config_path: Text) -> None: @@ -12,18 +13,16 @@ def data_load(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - raw_data_path = config['data_load']['raw_data_path'] - classes_names_path = config['data_load']['classes_names_path'] + config = load_config(config_path) data = load_iris(as_frame=True) classes_names = data.target_names.tolist() dataset = data.frame dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()] - dataset.to_csv(raw_data_path, index=False) + dataset.to_csv(config.data_load.raw_data_path, index=False) - with open(classes_names_path, 'w') as classes_names_file: + with open(config.data_load.classes_names_path, 'w') as classes_names_file: json.dump(obj={'classes_names': classes_names}, fp=classes_names_file) diff --git a/src/evaluate.py b/src/evaluate.py index bc0d6098..d9c4e428 100644 --- a/src/evaluate.py +++ b/src/evaluate.py @@ -4,7 +4,8 @@ import pandas as pd from sklearn.metrics import f1_score from typing import Text -import yaml + +from src.utils import load_config def evaluate(config_path: Text) -> None: @@ -13,40 +14,29 @@ def evaluate(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - classes_names_path = config['data_load']['classes_names_path'] - test_dataset_path = config['data_split']['test_path'] - model_path = config['train']['model_path'] - metrics_path = config['evaluate']['metrics_file'] - confusion_matrix_path = config['evaluate']['confusion_matrix'] + config = load_config(config_path) - classes = json.load(open(classes_names_path))['classes_names'] + classes = json.load(open(config.data_load.classes_names_path))['classes_names'] - test_dataset = pd.read_csv(test_dataset_path) + test_dataset = pd.read_csv(config.data_split.test_path) y = test_dataset.loc[:, 'target'].values.astype('float32') X = test_dataset.drop('target', axis=1).values - clf = joblib.load(model_path) + clf = joblib.load(config.train.model_path) prediction = clf.predict(X) f1 = f1_score(y_true=y, y_pred=prediction, average='macro') json.dump( obj={'f1_score': f1}, - fp=open(metrics_path, 'w') + fp=open(config.evaluate.metrics_file, 'w') ) - # pd.DataFrame({'actual': y, 'predicted': prediction}).apply( - # lambda series: series.map( - # {i: cls_name for i, cls_name in enumerate(classes)} - # ) - # ).to_csv(confusion_matrix_path, index=False) - mapping = {i: cls_name for i, cls_name in enumerate(classes)} cmdf = pd.DataFrame( {'actual': y, 'predicted': prediction} ).apply(lambda series: series.map(mapping)) - cmdf.to_csv(confusion_matrix_path, index=False) + cmdf.to_csv(config.evaluate.confusion_matrix, index=False) if __name__ == '__main__': @@ -56,4 +46,3 @@ def evaluate(config_path: Text) -> None: args = args_parser.parse_args() evaluate(config_path=args.config) - diff --git a/src/featurization.py b/src/featurization.py index 2306f34f..9aea31a6 100644 --- a/src/featurization.py +++ b/src/featurization.py @@ -1,7 +1,8 @@ import argparse import pandas as pd from typing import Text -import yaml + +from src.utils import load_config def get_features(dataset): @@ -17,13 +18,11 @@ def featurize(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - raw_data_path = config['data_load']['raw_data_path'] - featurized_dataset_path = config['featurize']['features_path'] + config = load_config(config_path) - dataset = pd.read_csv(raw_data_path) + dataset = pd.read_csv(config.data_load.raw_data_path) features = get_features(dataset) - features.to_csv(featurized_dataset_path, index=False) + features.to_csv(config.featurize.features_path, index=False) if __name__ == '__main__': diff --git a/src/split_dataset.py b/src/split_dataset.py index ffd6f119..8b8bd38a 100644 --- a/src/split_dataset.py +++ b/src/split_dataset.py @@ -2,7 +2,8 @@ from sklearn.model_selection import train_test_split import pandas as pd from typing import Text -import yaml + +from src.utils import load_config def split_train_test(config_path: Text) -> None: @@ -11,20 +12,15 @@ def split_train_test(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - featurized_dataset_path = config['featurize']['features_path'] - train_dataset_path = config['data_split']['train_path'] - test_dataset_path = config['data_split']['test_path'] - test_size = config['data_split']['test_size'] - - dataset = pd.read_csv(featurized_dataset_path) + config = load_config(config_path) + dataset = pd.read_csv(config.featurize.features_path) # Split in train/test - + test_size = config.data_split.test_size df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=42) - df_train.to_csv(train_dataset_path, index=False) - df_test.to_csv(test_dataset_path, index=False) + df_train.to_csv(config.data_split.train_path, index=False) + df_test.to_csv(config.data_split.test_path, index=False) if __name__ == '__main__': diff --git a/src/train.py b/src/train.py index fefd056e..56f39c73 100644 --- a/src/train.py +++ b/src/train.py @@ -4,7 +4,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from typing import Text -import yaml + +from src.utils import load_config def train(config_path: Text) -> None: @@ -13,11 +14,9 @@ def train(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - train_dataset_path = config['data_split']['train_path'] - model_path = config['train']['model_path'] + config = load_config(config_path) # Load train set - train_dataset = pd.read_csv(train_dataset_path) + train_dataset = pd.read_csv(config.data_split.train_path) # Get X and Y y = train_dataset.loc[:, 'target'].values.astype('float32') @@ -27,7 +26,7 @@ def train(config_path: Text) -> None: clf = LogisticRegression(C=0.00001, solver='lbfgs', multi_class='multinomial', max_iter=100) clf.fit(X, y) - joblib.dump(clf, model_path) + joblib.dump(clf, config.train.model_path) if __name__ == '__main__': @@ -36,4 +35,4 @@ def train(config_path: Text) -> None: args_parser.add_argument('--config', dest='config', required=True) args = args_parser.parse_args() - train(config_path=args.config) \ No newline at end of file + train(config_path=args.config) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 00000000..ce4b6a50 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,19 @@ +import box +from typing import Text +import yaml + + +def load_config(config_path: Text) -> box.ConfigBox: + """Loads yaml config in instance of box.ConfigBox. + Args: + config_path {Text}: path to config + Returns: + box.ConfigBox + """ + + with open(config_path) as config_file: + + config = yaml.safe_load(config_file) + config = box.ConfigBox(config) + + return config