diff --git a/.gitignore b/.gitignore
index 69de92e5..74712a80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,17 @@
-# folders
+# Environments
+.venv
-## IDEs
+# IDEs
.idea
+.vscode
-## Python
-venv*
-__pycache__
-.ipynb_checkpoints
-
-## Project
-data
+# Mac OS
+.DS_Store
-# files
+# Python
+__pycache__
-## Mac OS
-.DS_Store
+# Project
+data/*
+models/*
+reports/*
diff --git a/README.md b/README.md
index cff8466b..e9f6e7a8 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,17 @@
-# Tutorial: dvc-3-automate-experiments
+# Tutorial: Automate DVC experiments
-## 1. clone this repository
+## 1. Create and activate a virtual environment
-```bash
-git clone https://gitlab.com/7labs.ru/tutorials-dvc/dvc-3-automate-experiments.git
-cd dvc-3-automate-experiments
-```
-
-## 2. Create and activate virtual environment
-
-Install virtualenv in advance:
+Create a virtual environment, add the project root to `PYTHONPATH`, and activate it:
```bash
-pip install virtualenv
+python3 -m venv .venv
+echo "export PYTHONPATH=$PWD" >> .venv/bin/activate
+source .venv/bin/activate
```
-Create virtual environment
-```bash
-virtualenv venv-dvc-3-automate-experiments
-source venv-dvc-3-automate-experiments/bin/activate
-```
-
-## 3. Install python libraries (including dvc)
+## 2. Install Python libraries
```bash
pip install -r requirements.txt
```
-
-
-## 4. Add Virtual Environment to Jupyter Notebook
-
-```bash
-python -m ipykernel install --user --name=venv-dvc-3-automate-experiments
-```
-
-## 5. Run and follow Jupyter Notebook `dvc-3-automate-experiments.ipynb` for instructions:
-
-```bash
-jupyter notebook
-```
-
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb
deleted file mode 100644
index 74371ddd..00000000
--- a/dvc-3-automate-experiments.ipynb
+++ /dev/null
@@ -1,2972 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Install and init DVC\n",
- "\n",
- "Prerequisites: \n",
- "- DVC and requirements.txt packages installed (if not - check README.md file for instructions)\n",
- "- A project repository is a Git repo \n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Install with pip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-01T07:32:18.843826Z",
- "start_time": "2020-07-01T07:32:16.105734Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Collecting dvc==1.5.0\n",
- " Using cached dvc-1.5.0-py2.py3-none-any.whl (445 kB)\n",
- "Collecting ruamel.yaml>=0.16.1\n",
- " Using cached ruamel.yaml-0.16.10-py2.py3-none-any.whl (111 kB)\n",
- "Collecting shortuuid>=0.5.0\n",
- " Using cached shortuuid-1.0.1-py3-none-any.whl (7.5 kB)\n",
- "Collecting shtab<2,>=1.3.0\n",
- " Using cached shtab-1.3.1-py2.py3-none-any.whl (12 kB)\n",
- "Collecting pydot>=1.2.4\n",
- " Using cached pydot-1.4.1-py2.py3-none-any.whl (19 kB)\n",
- "Collecting rich>=3.0.5\n",
- " Using cached rich-5.2.0-py3-none-any.whl (145 kB)\n",
- "Collecting tabulate>=0.8.7\n",
- " Using cached tabulate-0.8.7-py3-none-any.whl (24 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/3c/33/97/805b282e129f60bb4e87cea622338f30b65f21eaf65219971f/funcy-1.14-py2.py3-none-any.whl\n",
- "Processing /home/alex/.cache/pip/wheels/49/68/a0/8e7cb7bbf4990fc10b5a082aa0eb3ac66787ca11e8eca445b2/flufl.lock-3.2-py3-none-any.whl\n",
- "Collecting pyasn1>=0.4.1\n",
- " Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n",
- "Collecting appdirs>=1.4.3\n",
- " Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
- "Requirement already satisfied: setuptools>=34.0.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (47.1.1)\n",
- "Collecting tqdm<5,>=4.45.0\n",
- " Using cached tqdm-4.48.2-py2.py3-none-any.whl (68 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/bc/f8/ae/bc69cb5f61393ebf9ade4cde41d1a813d35bfe78263a26f99e/dpath-2.0.1-py3-none-any.whl\n",
- "Collecting grandalf==0.6\n",
- " Using cached grandalf-0.6-py3-none-any.whl (31 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/b8/92/aa/456d462c908b4e210c3928f778d28f94049fc9e47af8b191c9/nanotime-0.5.2-py3-none-any.whl\n",
- "Collecting flatten-json<0.1.8,>=0.1.6\n",
- " Using cached flatten_json-0.1.7-py3-none-any.whl (6.4 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/ce/22/5c/bcd55db68399954d13c8d3b23192a517dd59ba3ee8648fa773/pygtrie-2.3.2-py3-none-any.whl\n",
- "Requirement already satisfied: packaging>=19.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (20.4)\n",
- "Requirement already satisfied: PyYAML<5.4,>=5.1.2 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (5.3)\n",
- "Processing /home/alex/.cache/pip/wheels/17/a2/0a/00fa5a0d6f271c82fc59be9ae47173bb6e6a462d4361224072/jsonpath_ng-1.5.1-py3-none-any.whl\n",
- "Collecting colorama>=0.3.9\n",
- " Using cached colorama-0.4.3-py2.py3-none-any.whl (15 kB)\n",
- "Collecting toml>=0.10.1\n",
- " Using cached toml-0.10.1-py2.py3-none-any.whl (19 kB)\n",
- "Collecting pathspec>=0.6.0\n",
- " Using cached pathspec-0.8.0-py2.py3-none-any.whl (28 kB)\n",
- "Collecting gitpython>3\n",
- " Using cached GitPython-3.1.7-py3-none-any.whl (158 kB)\n",
- "Collecting networkx<2.5,>=2.1\n",
- " Using cached networkx-2.4-py3-none-any.whl (1.6 MB)\n",
- "Collecting ply>=3.9\n",
- " Using cached ply-3.11-py2.py3-none-any.whl (49 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/0d/c4/19/13d74440f2a571841db6b6e0a273694327498884dafb9cf978/configobj-5.0.6-py3-none-any.whl\n",
- "Collecting distro>=1.3.0\n",
- " Using cached distro-1.5.0-py2.py3-none-any.whl (18 kB)\n",
- "Collecting requests>=2.22.0\n",
- " Using cached requests-2.24.0-py2.py3-none-any.whl (61 kB)\n",
- "Processing /home/alex/.cache/pip/wheels/af/ee/20/047a79ba5ff692baa2f7e2e95c0cd57061a1673d59f5acf0d5/voluptuous-0.11.7-py3-none-any.whl\n",
- "Collecting zc.lockfile>=1.2.1\n",
- " Using cached zc.lockfile-2.0-py2.py3-none-any.whl (9.7 kB)\n",
- "Collecting ruamel.yaml.clib>=0.1.2; platform_python_implementation == \"CPython\" and python_version < \"3.9\"\n",
- " Using cached ruamel.yaml.clib-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (547 kB)\n",
- "Requirement already satisfied: pyparsing>=2.1.4 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from pydot>=1.2.4->dvc==1.5.0) (2.4.7)\n",
- "Collecting typing-extensions<4.0.0,>=3.7.4\n",
- " Using cached typing_extensions-3.7.4.2-py3-none-any.whl (22 kB)\n",
- "Collecting commonmark<0.10.0,>=0.9.0\n",
- " Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n",
- "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from rich>=3.0.5->dvc==1.5.0) (2.6.1)\n",
- "Processing /home/alex/.cache/pip/wheels/3e/5d/46/fa3cbde0ab8c53dbdd14658b3a4c97035b8851369ce8e79649/atpublic-2.0-py3-none-any.whl\n",
- "Processing /home/alex/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e/future-0.18.2-cp37-none-any.whl\n",
- "Requirement already satisfied: six in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from packaging>=19.0->dvc==1.5.0) (1.15.0)\n",
- "Requirement already satisfied: decorator in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from jsonpath-ng>=1.5.1->dvc==1.5.0) (4.4.2)\n",
- "Collecting gitdb<5,>=4.0.1\n",
- " Using cached gitdb-4.0.5-py3-none-any.whl (63 kB)\n",
- "Collecting certifi>=2017.4.17\n",
- " Using cached certifi-2020.6.20-py2.py3-none-any.whl (156 kB)\n",
- "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n",
- " Using cached urllib3-1.25.10-py2.py3-none-any.whl (127 kB)\n",
- "Collecting chardet<4,>=3.0.2\n",
- " Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)\n",
- "Collecting idna<3,>=2.5\n",
- " Using cached idna-2.10-py2.py3-none-any.whl (58 kB)\n",
- "Collecting smmap<4,>=3.0.1\n",
- " Using cached smmap-3.0.4-py2.py3-none-any.whl (25 kB)\n",
- "Installing collected packages: ruamel.yaml.clib, ruamel.yaml, shortuuid, shtab, pydot, colorama, typing-extensions, commonmark, rich, tabulate, funcy, atpublic, flufl.lock, pyasn1, appdirs, tqdm, dpath, future, grandalf, nanotime, flatten-json, pygtrie, ply, jsonpath-ng, toml, pathspec, smmap, gitdb, gitpython, networkx, configobj, distro, certifi, urllib3, chardet, idna, requests, voluptuous, zc.lockfile, dvc\n",
- " Attempting uninstall: tqdm\n",
- " Found existing installation: tqdm 4.42.0\n",
- " Uninstalling tqdm-4.42.0:\n",
- " Successfully uninstalled tqdm-4.42.0\n",
- "Successfully installed appdirs-1.4.4 atpublic-2.0 certifi-2020.6.20 chardet-3.0.4 colorama-0.4.3 commonmark-0.9.1 configobj-5.0.6 distro-1.5.0 dpath-2.0.1 dvc-1.5.0 flatten-json-0.1.7 flufl.lock-3.2 funcy-1.14 future-0.18.2 gitdb-4.0.5 gitpython-3.1.7 grandalf-0.6 idna-2.10 jsonpath-ng-1.5.1 nanotime-0.5.2 networkx-2.4 pathspec-0.8.0 ply-3.11 pyasn1-0.4.8 pydot-1.4.1 pygtrie-2.3.2 requests-2.24.0 rich-5.2.0 ruamel.yaml-0.16.10 ruamel.yaml.clib-0.2.0 shortuuid-1.0.1 shtab-1.3.1 smmap-3.0.4 tabulate-0.8.7 toml-0.10.1 tqdm-4.48.2 typing-extensions-3.7.4.2 urllib3-1.25.10 voluptuous-0.11.7 zc.lockfile-2.0\n",
- "\u001b[33mWARNING: You are using pip version 20.1.1; however, version 20.2.2 is available.\n",
- "You should consider upgrading via the '/home/alex/Dev/Projects/tutorials/tutorials-dvc/dvc-3-automate-experiments/venv-dvc-3-automate-experiments/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!pip install dvc==1.5.0"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Checkout branch `tutorial`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-01T07:32:19.401395Z",
- "start_time": "2020-07-01T07:32:19.271265Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Switched to a new branch 'dvc-tutorial'\r\n"
- ]
- }
- ],
- "source": [
- "!git checkout -b dvc-tutorial"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2019-06-08T11:18:29.199273Z",
- "start_time": "2019-06-08T11:18:29.196865Z"
- }
- },
- "source": [
- "## Initialize DVC\n",
- "\n",
- "References: \n",
- "- https://dvc.org/doc/get-started/initialize "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-01T07:32:22.463407Z",
- "start_time": "2020-07-01T07:32:21.450728Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "You can now commit the changes to git.\n",
- "\n",
- "\u001b[31m+---------------------------------------------------------------------+\n",
- "\u001b[39m\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n",
- "\u001b[31m|\u001b[39m DVC has enabled anonymous aggregate usage analytics. \u001b[31m|\u001b[39m\n",
- "\u001b[31m|\u001b[39m Read the analytics documentation (and how to opt-out) here: \u001b[31m|\u001b[39m\n",
- "\u001b[31m|\u001b[39m \u001b[34mhttps://dvc.org/doc/user-guide/analytics\u001b[39m \u001b[31m|\u001b[39m\n",
- "\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n",
- "\u001b[31m+---------------------------------------------------------------------+\n",
- "\u001b[39m\n",
- "\u001b[33mWhat's next?\u001b[39m\n",
- "\u001b[33m------------\u001b[39m\n",
- "- Check out the documentation: \u001b[34mhttps://dvc.org/doc\u001b[39m\n",
- "- Get help and share ideas: \u001b[34mhttps://dvc.org/chat\u001b[39m\n",
- "- Star us on GitHub: \u001b[34mhttps://github.com/iterative/dvc\u001b[39m\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc init"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Commit changes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-01T07:32:26.446894Z",
- "start_time": "2020-07-01T07:32:26.392814Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[dvc-tutorial f285905] Initialize DVC\n",
- " 6 files changed, 128 insertions(+)\n",
- " create mode 100644 .dvc/.gitignore\n",
- " create mode 100644 .dvc/config\n",
- " create mode 100644 .dvc/plots/confusion.json\n",
- " create mode 100644 .dvc/plots/default.json\n",
- " create mode 100644 .dvc/plots/scatter.json\n",
- " create mode 100644 .dvc/plots/smooth.json\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "\n",
- "git add .\n",
- "git commit -m \"Initialize DVC\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Build automated pipelines"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create `data_load` stage\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 94,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:35.023136Z",
- "start_time": "2020-07-03T19:30:34.904974Z"
- }
- },
- "outputs": [],
- "source": [
- "!mkdir -p data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 95,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:37.406056Z",
- "start_time": "2020-07-03T19:30:35.351794Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Restored stage 'data_load' from run-cache \n",
- "Skipping run, checking out outputs\n",
- "Creating 'dvc.yaml'\n",
- "Adding stage 'data_load' in 'dvc.yaml'\n",
- "Generating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock dvc.yaml .dvc/.gitignore\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc run -n data_load \\\n",
- " -d src/data_load.py \\\n",
- " -o data/iris.csv \\\n",
- " -o data/classes.json \\\n",
- " -p data_load \\\n",
- " python src/data_load.py \\\n",
- " --config=params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 96,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:37.455211Z",
- "start_time": "2020-07-03T19:30:37.433214Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "4.0K\tdata/classes.json\n",
- "4.0K\tdata/cm.csv\n",
- "4.0K\tdata/iris.csv\n",
- "8.0K\tdata/iris_featurized.csv\n",
- "4.0K\tdata/metrics.json\n",
- "8.0K\tdata/model.joblib\n",
- "4.0K\tdata/test.csv\n",
- "8.0K\tdata/train.csv\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "\n",
- "du -sh data/*"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 97,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:37.604922Z",
- "start_time": "2020-07-03T19:30:37.479654Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[01;34m.\u001b[00m\r\n",
- "├── README.md\r\n",
- "├── \u001b[01;34mdata\u001b[00m\r\n",
- "│ ├── classes.json\r\n",
- "│ ├── cm.csv\r\n",
- "│ ├── iris.csv\r\n",
- "│ ├── iris_featurized.csv\r\n",
- "│ ├── metrics.json\r\n",
- "│ ├── model.joblib\r\n",
- "│ ├── test.csv\r\n",
- "│ └── train.csv\r\n",
- "├── dvc-3-automate-experiments.ipynb\r\n",
- "├── dvc.lock\r\n",
- "├── dvc.yaml\r\n",
- "├── params.yaml\r\n",
- "├── requirements.txt\r\n",
- "└── \u001b[01;34msrc\u001b[00m\r\n",
- " ├── __init__.py\r\n",
- " ├── data_load.py\r\n",
- " ├── evaluate.py\r\n",
- " ├── featurization.py\r\n",
- " ├── split_dataset.py\r\n",
- " └── train.py\r\n",
- "\r\n",
- "2 directories, 20 files\r\n"
- ]
- }
- ],
- "source": [
- "!tree -I venv-dvc-3-automate-experiments"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## dvc.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 98,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:37.727096Z",
- "start_time": "2020-07-03T19:30:37.609182Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "stages:\r\n",
- " data_load:\r\n",
- " cmd: python src/data_load.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - src/data_load.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " outs:\r\n",
- " - data/classes.json\r\n",
- " - data/iris.csv\r\n"
- ]
- }
- ],
- "source": [
- "!cat dvc.yaml"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 99,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:37.877998Z",
- "start_time": "2020-07-03T19:30:37.755666Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r\n",
- "data_load:\r\n",
- " raw_data_path: data/iris.csv\r\n",
- " classes_names_path: data/classes.json\r\n",
- "\r\n",
- "featurize:\r\n",
- " features_path: data/iris_featurized.csv\r\n",
- " target_column: target\r\n",
- "\r\n",
- "\r\n",
- "data_split:\r\n",
- " test_size: 0.2\r\n",
- " train_path: data/train.csv\r\n",
- " test_path: data/test.csv\r\n",
- "\r\n",
- "\r\n",
- "train:\r\n",
- " model_path: data/model.joblib\r\n",
- "\r\n",
- "\r\n",
- "evaluate:\r\n",
- " metrics_file: data/metrics.json\r\n",
- " confusion_matrix: data/cm.csv\r\n"
- ]
- }
- ],
- "source": [
- "!cat params.yaml"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Reproduce a pipeline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 100,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:39.781553Z",
- "start_time": "2020-07-03T19:30:37.923002Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Data and pipelines are up to date.\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc repro"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Change params.yaml and reproduce \n",
- "\n",
- "Add a new line into `data_load` section:\n",
- " `dummy_param: dummy_value`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 101,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:41.698409Z",
- "start_time": "2020-07-03T19:30:39.807607Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Data and pipelines are up to date.\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc repro"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Build end-to-end Machine Learning pipeline\n",
- "Stages \n",
- "- extract features \n",
- "- split dataset \n",
- "- train \n",
- "- evaluate \n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Add feature extraction stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 103,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.387596Z",
- "start_time": "2020-07-03T19:30:43.388868Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Restored stage 'feature_extraction' from run-cache \n",
- "Skipping run, checking out outputs\n",
- "Adding stage 'feature_extraction' in 'dvc.yaml'\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock dvc.yaml\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc run -n feature_extraction \\\n",
- " -d src/featurization.py \\\n",
- " -d data/iris.csv \\\n",
- " -o data/iris_featurized.csv \\\n",
- " -p data_load,featurize \\\n",
- " python src/featurization.py \\\n",
- " --config=params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 104,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.561869Z",
- "start_time": "2020-07-03T19:30:45.439521Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "README.md params.yaml\r\n",
- "\u001b[1m\u001b[36mdata\u001b[m\u001b[m requirements.txt\r\n",
- "dvc-3-automate-experiments.ipynb \u001b[1m\u001b[36msrc\u001b[m\u001b[m\r\n",
- "dvc.lock \u001b[1m\u001b[36mvenv-dvc-3-automate-experiments\u001b[m\u001b[m\r\n",
- "dvc.yaml\r\n"
- ]
- }
- ],
- "source": [
- "!ls "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 105,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.706627Z",
- "start_time": "2020-07-03T19:30:45.585641Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "stages:\r\n",
- " data_load:\r\n",
- " cmd: python src/data_load.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - src/data_load.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " outs:\r\n",
- " - data/classes.json\r\n",
- " - data/iris.csv\r\n",
- " feature_extraction:\r\n",
- " cmd: python src/featurization.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris.csv\r\n",
- " - src/featurization.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/iris_featurized.csv\r\n"
- ]
- }
- ],
- "source": [
- "!cat dvc.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 106,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.745702Z",
- "start_time": "2020-07-03T19:30:45.734321Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " sepal_length | \n",
- " sepal_width | \n",
- " petal_length | \n",
- " petal_width | \n",
- " target | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.1 | \n",
- " 3.5 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4.9 | \n",
- " 3.0 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4.7 | \n",
- " 3.2 | \n",
- " 1.3 | \n",
- " 0.2 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.6 | \n",
- " 3.1 | \n",
- " 1.5 | \n",
- " 0.2 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5.0 | \n",
- " 3.6 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " sepal_length sepal_width petal_length petal_width target\n",
- "0 5.1 3.5 1.4 0.2 0\n",
- "1 4.9 3.0 1.4 0.2 0\n",
- "2 4.7 3.2 1.3 0.2 0\n",
- "3 4.6 3.1 1.5 0.2 0\n",
- "4 5.0 3.6 1.4 0.2 0"
- ]
- },
- "execution_count": 106,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "features = pd.read_csv('data/iris_featurized.csv')\n",
- "features.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 107,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.893549Z",
- "start_time": "2020-07-03T19:30:45.763986Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[31m??\u001b[m .dvc/\r\n",
- "\u001b[31m??\u001b[m dvc.lock\r\n",
- "\u001b[31m??\u001b[m dvc.yaml\r\n"
- ]
- }
- ],
- "source": [
- "!git status -s"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 108,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:45.961182Z",
- "start_time": "2020-07-03T19:30:45.916816Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[dev 0ae7569] Add stage features_extraction\n",
- " 3 files changed, 56 insertions(+)\n",
- " create mode 100644 .dvc/.gitignore\n",
- " create mode 100644 dvc.lock\n",
- " create mode 100644 dvc.yaml\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "git add .\n",
- "git commit -m \"Add stage features_extraction\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Add split train/test stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 109,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:48.044867Z",
- "start_time": "2020-07-03T19:30:45.984594Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Restored stage 'split_dataset' from run-cache \n",
- "Skipping run, checking out outputs\n",
- "Adding stage 'split_dataset' in 'dvc.yaml'\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock dvc.yaml\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc run -n split_dataset \\\n",
- " -d src/split_dataset.py \\\n",
- " -d data/iris_featurized.csv \\\n",
- " -o data/train.csv \\\n",
- " -o data/test.csv \\\n",
- " -p featurize,data_split \\\n",
- " python src/split_dataset.py \\\n",
- " --config=params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 110,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:48.186864Z",
- "start_time": "2020-07-03T19:30:48.068177Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "stages:\r\n",
- " data_load:\r\n",
- " cmd: python src/data_load.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - src/data_load.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " outs:\r\n",
- " - data/classes.json\r\n",
- " - data/iris.csv\r\n",
- " feature_extraction:\r\n",
- " cmd: python src/featurization.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris.csv\r\n",
- " - src/featurization.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/iris_featurized.csv\r\n",
- " split_dataset:\r\n",
- " cmd: python src/split_dataset.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris_featurized.csv\r\n",
- " - src/split_dataset.py\r\n",
- " params:\r\n",
- " - data_split\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/test.csv\r\n",
- " - data/train.csv\r\n"
- ]
- }
- ],
- "source": [
- "!cat dvc.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 111,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:48.250249Z",
- "start_time": "2020-07-03T19:30:48.209429Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[dev e39a9d3] Add stage split_dataset\n",
- " 2 files changed, 32 insertions(+)\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "git add .\n",
- "git commit -m \"Add stage split_dataset\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Add train stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 112,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:50.298161Z",
- "start_time": "2020-07-03T19:30:48.275068Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Restored stage 'train' from run-cache \n",
- "Skipping run, checking out outputs\n",
- "Adding stage 'train' in 'dvc.yaml'\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock dvc.yaml\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc run -n train \\\n",
- " -d src/train.py \\\n",
- " -d data/train.csv \\\n",
- " -o data/model.joblib \\\n",
- " -p data_split,train \\\n",
- " python src/train.py \\\n",
- " --config=params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 113,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:50.444828Z",
- "start_time": "2020-07-03T19:30:50.324345Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "stages:\r\n",
- " data_load:\r\n",
- " cmd: python src/data_load.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - src/data_load.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " outs:\r\n",
- " - data/classes.json\r\n",
- " - data/iris.csv\r\n",
- " feature_extraction:\r\n",
- " cmd: python src/featurization.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris.csv\r\n",
- " - src/featurization.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/iris_featurized.csv\r\n",
- " split_dataset:\r\n",
- " cmd: python src/split_dataset.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris_featurized.csv\r\n",
- " - src/split_dataset.py\r\n",
- " params:\r\n",
- " - data_split\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/test.csv\r\n",
- " - data/train.csv\r\n",
- " train:\r\n",
- " cmd: python src/train.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/train.csv\r\n",
- " - src/train.py\r\n",
- " params:\r\n",
- " - data_split\r\n",
- " - train\r\n",
- " outs:\r\n",
- " - data/model.joblib\r\n"
- ]
- }
- ],
- "source": [
- "!cat dvc.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 114,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:50.512656Z",
- "start_time": "2020-07-03T19:30:50.468759Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[dev d084d1b] Add stage train\n",
- " 2 files changed, 28 insertions(+)\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "git add .\n",
- "git commit -m \"Add stage train\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Add evaluate stage"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 115,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:52.746281Z",
- "start_time": "2020-07-03T19:30:50.546074Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Restored stage 'evaluate' from run-cache \n",
- "Skipping run, checking out outputs\n",
- "Adding stage 'evaluate' in 'dvc.yaml'\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.yaml dvc.lock\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc run -n evaluate \\\n",
- " -d src/evaluate.py \\\n",
- " -d data/test.csv \\\n",
- " -d data/model.joblib \\\n",
- " -d data/classes.json \\\n",
- " -m data/metrics.json \\\n",
- " --plots data/cm.csv \\\n",
- " -p data_load,data_split,train,evaluate \\\n",
- " python src/evaluate.py \\\n",
- " --config=params.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:52.886914Z",
- "start_time": "2020-07-03T19:30:52.769527Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "stages:\r\n",
- " data_load:\r\n",
- " cmd: python src/data_load.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - src/data_load.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " outs:\r\n",
- " - data/classes.json\r\n",
- " - data/iris.csv\r\n",
- " feature_extraction:\r\n",
- " cmd: python src/featurization.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris.csv\r\n",
- " - src/featurization.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/iris_featurized.csv\r\n",
- " split_dataset:\r\n",
- " cmd: python src/split_dataset.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/iris_featurized.csv\r\n",
- " - src/split_dataset.py\r\n",
- " params:\r\n",
- " - data_split\r\n",
- " - featurize\r\n",
- " outs:\r\n",
- " - data/test.csv\r\n",
- " - data/train.csv\r\n",
- " train:\r\n",
- " cmd: python src/train.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/train.csv\r\n",
- " - src/train.py\r\n",
- " params:\r\n",
- " - data_split\r\n",
- " - train\r\n",
- " outs:\r\n",
- " - data/model.joblib\r\n",
- " evaluate:\r\n",
- " cmd: python src/evaluate.py --config=params.yaml\r\n",
- " deps:\r\n",
- " - data/classes.json\r\n",
- " - data/model.joblib\r\n",
- " - data/test.csv\r",
- "\r\n",
- " - src/evaluate.py\r\n",
- " params:\r\n",
- " - data_load\r\n",
- " - data_split\r\n",
- " - evaluate\r\n",
- " - train\r\n",
- " metrics:\r\n",
- " - data/metrics.json\r\n",
- " plots:\r\n",
- " - data/cm.csv\r\n"
- ]
- }
- ],
- "source": [
- "!cat dvc.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:30:52.971253Z",
- "start_time": "2020-07-03T19:30:52.919420Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[dev ecf5bc5] Add stage evaluate\n",
- " 2 files changed, 46 insertions(+)\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "git add .\n",
- "git commit -m \"Add stage evaluate\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-06-28T17:23:10.812463Z",
- "start_time": "2020-06-28T17:23:09.886129Z"
- }
- },
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Experimenting with reproducible pipelines"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## How to reproduce experiments?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "> The most exciting part of DVC is reproducibility.\n",
- ">> Reproducibility is the time you are getting benefits out of DVC instead of spending time defining the ML pipelines.\n",
- "\n",
- "> DVC tracks all the dependencies, which helps you iterate on ML models faster without thinking what was affected by your last change.\n",
- ">> In order to track all the dependencies, DVC finds and reads ALL the DVC-files in a repository and builds a dependency graph (DAG) based on these files.\n",
- "\n",
- "> This is one of the differences between DVC reproducibility and traditional Makefile-like build automation tools (Make, Maven, Ant, Rakefile etc). It was designed in such a way to localize specification of DAG nodes.\n",
- "If you run repro on any created DVC-file from our repository, nothing happens because nothing was changed in the defined pipeline.\n",
- "\n",
- "(c) dvc.org https://dvc.org/doc/tutorial/reproducibility"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 118,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:02.889684Z",
- "start_time": "2020-07-03T19:31:00.936546Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Stage 'feature_extraction' didn't change, skipping\n",
- "Stage 'split_dataset' didn't change, skipping\n",
- "Stage 'train' didn't change, skipping\n",
- "Stage 'evaluate' didn't change, skipping\n",
- "Data and pipelines are up to date.\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Nothing to reproduce\n",
- "!dvc repro"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Experiment 1: Add features\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create new experiment branch\n",
- "\n",
- "Before editing the src/featurization.py file, please create and check out a new branch __exp1-ratio-features__"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 119,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:05.089755Z",
- "start_time": "2020-07-03T19:31:04.832150Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Switched to a new branch 'exp1-ratio-features'\n",
- " dev\u001b[m\n",
- " dvc-tutorial\u001b[m\n",
- "* \u001b[32mexp1-ratio-features\u001b[m\n",
- " master\u001b[m\n"
- ]
- }
- ],
- "source": [
- "# create new branch\n",
- "\n",
- "!git checkout -b exp1-ratio-features\n",
- "!git branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Update featurization.py"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In file __featurization.py__, in function `get_features()`, after the line\n",
- "\n",
- "```python\n",
- " features = dataset.copy()\n",
- "```\n",
- "\n",
- "add lines:\n",
- "\n",
- "```python\n",
- " features['sepal_length_to_sepal_width'] = features['sepal_length'] / features['sepal_width']\n",
- " features['petal_length_to_petal_width'] = features['petal_length'] / features['petal_width']\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reproduce pipeline "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 120,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:28.674990Z",
- "start_time": "2020-07-03T19:31:25.527004Z"
- },
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Running stage 'feature_extraction' with command:\n",
- "\tpython src/featurization.py --config=params.yaml\n",
- "Updating lock file 'dvc.lock' \n",
- "\n",
- "Restored stage 'split_dataset' from run-cache\n",
- "Skipping run, checking out outputs\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "Restored stage 'train' from run-cache\n",
- "Skipping run, checking out outputs\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "Restored stage 'evaluate' from run-cache\n",
- "Skipping run, checking out outputs\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc repro"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 121,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:28.713726Z",
- "start_time": "2020-07-03T19:31:28.699701Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " sepal_length | \n",
- " sepal_width | \n",
- " petal_length | \n",
- " petal_width | \n",
- " target | \n",
- " sepal_length_to_sepal_width | \n",
- " petal_length_to_petal_width | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.1 | \n",
- " 3.5 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- " 1.457143 | \n",
- " 7.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4.9 | \n",
- " 3.0 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- " 1.633333 | \n",
- " 7.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4.7 | \n",
- " 3.2 | \n",
- " 1.3 | \n",
- " 0.2 | \n",
- " 0 | \n",
- " 1.468750 | \n",
- " 6.5 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4.6 | \n",
- " 3.1 | \n",
- " 1.5 | \n",
- " 0.2 | \n",
- " 0 | \n",
- " 1.483871 | \n",
- " 7.5 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5.0 | \n",
- " 3.6 | \n",
- " 1.4 | \n",
- " 0.2 | \n",
- " 0 | \n",
- " 1.388889 | \n",
- " 7.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " sepal_length sepal_width petal_length petal_width target \\\n",
- "0 5.1 3.5 1.4 0.2 0 \n",
- "1 4.9 3.0 1.4 0.2 0 \n",
- "2 4.7 3.2 1.3 0.2 0 \n",
- "3 4.6 3.1 1.5 0.2 0 \n",
- "4 5.0 3.6 1.4 0.2 0 \n",
- "\n",
- " sepal_length_to_sepal_width petal_length_to_petal_width \n",
- "0 1.457143 7.0 \n",
- "1 1.633333 7.0 \n",
- "2 1.468750 6.5 \n",
- "3 1.483871 7.5 \n",
- "4 1.388889 7.0 "
- ]
- },
- "execution_count": 121,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Check features used in this pipeline\n",
- "\n",
- "import pandas as pd\n",
- "\n",
- "features = pd.read_csv('data/iris_featurized.csv')\n",
- "features.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 122,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:28.867945Z",
- "start_time": "2020-07-03T19:31:28.737094Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "On branch exp1-ratio-features\r\n",
- "Changes not staged for commit:\r\n",
- " (use \"git add <file>...\" to update what will be committed)\r\n",
- " (use \"git restore <file>...\" to discard changes in working directory)\r\n",
- "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n",
- "\t\u001b[31mmodified: src/featurization.py\u001b[m\r\n",
- "\r\n",
- "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n"
- ]
- }
- ],
- "source": [
- "!git status"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 124,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:36.736663Z",
- "start_time": "2020-07-03T19:31:35.023151Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Metric Value Change \n",
- "data/metrics.json f1_score 0.15385 0.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Get difference with metric from previous pipeline\n",
- "!dvc metrics diff --all"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 125,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:31:39.838836Z",
- "start_time": "2020-07-03T19:31:39.445353Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[exp1-ratio-features 1fc8ec3] Experiment with new features\n",
- " 3 files changed, 872 insertions(+), 510 deletions(-)\n",
- "fatal: tag 'exp1_ratio_features' already exists\n"
- ]
- }
- ],
- "source": [
- "!git add .\n",
- "!git commit -m \"Experiment with new features\"\n",
- "!git tag -a \"exp1_ratio_features\" -m \"Experiment with new features\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Experiment 2: Tune Logistic Regression"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create a new experiment branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 127,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:32:43.387938Z",
- "start_time": "2020-07-03T19:32:43.131917Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Switched to a new branch 'exp2-tuning-logreg'\n",
- " dev\u001b[m\n",
- " dvc-tutorial\u001b[m\n",
- " exp1-ratio-features\u001b[m\n",
- "* \u001b[32mexp2-tuning-logreg\u001b[m\n",
- " master\u001b[m\n"
- ]
- }
- ],
- "source": [
- "# create new branch for experiment\n",
- "\n",
- "!git checkout -b exp2-tuning-logreg\n",
- "!git branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 129,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:32:52.254763Z",
- "start_time": "2020-07-03T19:32:50.225661Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Stage 'feature_extraction' didn't change, skipping\n",
- "Stage 'split_dataset' didn't change, skipping\n",
- "Stage 'train' didn't change, skipping\n",
- "Stage 'evaluate' didn't change, skipping\n",
- "Data and pipelines are up to date.\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Nothing to reproduce since code was checked out by `git checkout`\n",
- "# and data files were checked out by `dvc checkout`\n",
- "!dvc repro"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Tuning parameters\n",
- "\n",
- "in file __train.py__ :\n",
- "\n",
- "replace LogisticRegression params with:\n",
- "\n",
- "```python\n",
- " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n",
- "```\n",
- "__Note__: here we changed logistic regression hyperparameters: C to 0.1\n",
- "\n",
- "\n",
- "https://dvc.org/doc/tutorials/get-started/experiments#tuning-parameters"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reproduce pipelines"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 130,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:33:22.746410Z",
- "start_time": "2020-07-03T19:33:19.314933Z"
- },
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Stage 'feature_extraction' didn't change, skipping\n",
- "Stage 'split_dataset' didn't change, skipping\n",
- "Running stage 'train' with command:\n",
- "\tpython src/train.py --config=params.yaml\n",
- "Updating lock file 'dvc.lock' \n",
- "\n",
- "Restored stage 'evaluate' from run-cache\n",
- "Skipping run, checking out outputs\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# re-run pipeline \n",
- "\n",
- "!dvc repro"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 131,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:33:24.945534Z",
- "start_time": "2020-07-03T19:33:24.825464Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\"f1_score\": 1.0}"
- ]
- }
- ],
- "source": [
- "# Show the metric produced by the re-run pipeline\n",
- "!cat data/metrics.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 134,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:34:06.466000Z",
- "start_time": "2020-07-03T19:34:05.328958Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\tdata/metrics.json: \n",
- "\t\tf1_score: 1.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc metrics show"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 135,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:34:08.160934Z",
- "start_time": "2020-07-03T19:34:06.494683Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Metric Value Change \n",
- "data/metrics.json f1_score 1.0 0.84615\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc metrics diff --all"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Commit"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 137,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:15.808072Z",
- "start_time": "2020-07-03T19:36:15.762972Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "On branch exp2-tuning-logreg\n",
- "nothing to commit, working tree clean\n"
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "\n",
- "git add .\n",
- "git commit -m \"Tune model. LogisticRegression. C=0.1\"\n",
- "git tag -a \"exp2_tuning_logreg\" -m \"Tune model. LogisticRegression. C=0.1\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Experiment 3: Use SVM"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:14:01.831192Z",
- "start_time": "2020-07-03T19:14:01.829062Z"
- }
- },
- "source": [
- "### Create a new experiment branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 138,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:20.443851Z",
- "start_time": "2020-07-03T19:36:20.187021Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Switched to a new branch 'exp3-svm'\n",
- " dev\u001b[m\n",
- " dvc-tutorial\u001b[m\n",
- " exp1-ratio-features\u001b[m\n",
- " exp2-tuning-logreg\u001b[m\n",
- "* \u001b[32mexp3-svm\u001b[m\n",
- " master\u001b[m\n"
- ]
- }
- ],
- "source": [
- "!git checkout -b exp3-svm\n",
- "!git branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Update train.py\n",
- "\n",
- "in file __train.py__ replace line\n",
- "\n",
- "```python\n",
- " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n",
- "```\n",
- "\n",
- "with line\n",
- "\n",
- "```python\n",
- " clf = SVC(C=0.01, kernel='linear', gamma='scale', degree=5)\n",
- "```\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reproduce pipeline "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 139,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:35.537208Z",
- "start_time": "2020-07-03T19:36:32.544097Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Stage 'data_load' didn't change, skipping \n",
- "Stage 'feature_extraction' didn't change, skipping\n",
- "Stage 'split_dataset' didn't change, skipping\n",
- "Running stage 'train' with command:\n",
- "\tpython src/train.py --config=params.yaml\n",
- "Updating lock file 'dvc.lock' \n",
- "\n",
- "Restored stage 'evaluate' from run-cache\n",
- "Skipping run, checking out outputs\n",
- "Updating lock file 'dvc.lock'\n",
- "\n",
- "To track the changes with git, run:\n",
- "\n",
- "\tgit add dvc.lock\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc repro"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 140,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:38.995561Z",
- "start_time": "2020-07-03T19:36:37.831841Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\tdata/metrics.json: \n",
- "\t\tf1_score: 1.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc metrics show"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 141,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:40.521084Z",
- "start_time": "2020-07-03T19:36:40.392754Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "On branch exp3-svm\r\n",
- "Changes not staged for commit:\r\n",
- " (use \"git add <file>...\" to update what will be committed)\r\n",
- " (use \"git restore <file>...\" to discard changes in working directory)\r\n",
- "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n",
- "\t\u001b[31mmodified: src/train.py\u001b[m\r\n",
- "\r\n",
- "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n"
- ]
- }
- ],
- "source": [
- "!git status"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 142,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:36:41.766798Z",
- "start_time": "2020-07-03T19:36:41.377185Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[exp3-svm 1474ec0] Experiment 3 with SVM estimator\r\n",
- " 2 files changed, 5 insertions(+), 4 deletions(-)\r\n"
- ]
- }
- ],
- "source": [
- "!git add .\n",
- "!git commit -m \"Experiment 3 with SVM estimator\"\n",
- "!git tag -a \"exp3_svm\" -m \"Experiment 3 with SVM estimator\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Merge the best experiment into the `dvc-tutorial` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 153,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:45:17.537969Z",
- "start_time": "2020-07-03T19:45:17.463715Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Auto-merging src/train.py\n",
- "CONFLICT (content): Merge conflict in src/train.py\n",
- "Auto-merging src/featurization.py\n",
- "CONFLICT (add/add): Merge conflict in dvc.lock\n",
- "Auto-merging dvc.lock\n",
- "Auto-merging dvc-3-automate-experiments.ipynb\n",
- "CONFLICT (content): Merge conflict in dvc-3-automate-experiments.ipynb\n",
- "Automatic merge failed; fix conflicts and then commit the result.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Previous HEAD position was 1474ec0 Experiment 3 with SVM estimator\n",
- "Switched to branch 'dvc-tutorial'\n"
- ]
- },
- {
- "ename": "CalledProcessError",
- "evalue": "Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1.",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2352\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n",
- "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1."
- ]
- }
- ],
- "source": [
- "%%bash\n",
- "\n",
- "git checkout dvc-tutorial \n",
- "git merge exp3_svm"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Compare experiments"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Compare params "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:39:20.728429Z",
- "start_time": "2020-07-03T19:39:19.065249Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[0m "
- ]
- }
- ],
- "source": [
- "# Get params diffs \n",
- "\n",
- "!dvc params diff"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 148,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:39:29.288964Z",
- "start_time": "2020-07-03T19:39:27.598159Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Param Old New \n",
- "params.yaml data_load.classes_names_path data/classes.json data/classes.json\n",
- "params.yaml data_load.raw_data_path data/iris.csv data/iris.csv\n",
- "params.yaml data_split.test_path data/test.csv data/test.csv\n",
- "params.yaml data_split.test_size 0.2 0.2\n",
- "params.yaml data_split.train_path data/train.csv data/train.csv\n",
- "params.yaml evaluate.confusion_matrix data/cm.csv data/cm.csv\n",
- "params.yaml evaluate.metrics_file data/metrics.json data/metrics.json\n",
- "params.yaml featurize.features_path data/iris_featurized.csv data/iris_featurized.csv\n",
- "params.yaml featurize.target_column target target\n",
- "params.yaml train.model_path data/model.joblib data/model.joblib\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Compare parameters with a specific commit, a tag or any revision\n",
- "\n",
- "!dvc params diff --all"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:09:20.304575Z",
- "start_time": "2020-07-03T19:09:18.649548Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\"params.yaml\": {\"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}}}\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc params diff --show-json --all"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:09:27.495017Z",
- "start_time": "2020-07-03T19:09:25.848748Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "| Path | Param | Old | New |\n",
- "|-------------|------------------------------|--------------------------|--------------------------|\n",
- "| params.yaml | data_load.classes_names_path | data/classes.json | data/classes.json |\n",
- "| params.yaml | data_load.raw_data_path | data/iris.csv | data/iris.csv |\n",
- "| params.yaml | data_split.test_path | data/test.csv | data/test.csv |\n",
- "| params.yaml | data_split.test_size | 0.2 | 0.2 |\n",
- "| params.yaml | data_split.train_path | data/train.csv | data/train.csv |\n",
- "| params.yaml | evaluate.confusion_matrix | data/cm.csv | data/cm.csv |\n",
- "| params.yaml | evaluate.metrics_file | data/metrics.json | data/metrics.json |\n",
- "| params.yaml | featurize.features_path | data/iris_featurized.csv | data/iris_featurized.csv |\n",
- "| params.yaml | featurize.target_column | target | target |\n",
- "| params.yaml | train.model_path | data/model.joblib | data/model.joblib |\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc params diff --show-md --all"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:00:44.847802Z",
- "start_time": "2020-07-03T19:00:44.717758Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[33mcommit 736c92a6eeda6261f528d7a2e2d4db4cb306fa03\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mexp2-svm\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp2_svm\u001b[m\u001b[33m)\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Fri Jul 3 21:49:25 2020 +0300\r\n",
- "\r\n",
- " Experiment 2 with SVM estimator\r\n",
- "\r\n",
- "\u001b[33mcommit 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Fri Jul 3 21:48:42 2020 +0300\r\n",
- "\r\n",
- " Experiment with new features\r\n",
- "\r\n",
- "\u001b[33mcommit 34a0bc667f86c3b5e388bef672eb598b8a6a7788\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m)\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 10:35:03 2020 +0300\r\n",
- "\r\n",
- " Add stage evaluate\r\n",
- "\r\n",
- "\u001b[33mcommit 4c45a4ff702106d78bbaf8d356e0e95ca268e05b\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 10:34:09 2020 +0300\r\n",
- "\r\n",
- " Add stage train\r\n",
- "\r\n",
- "\u001b[33mcommit f41781d2c4855762c4405636491bc014cc00bd20\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 10:34:00 2020 +0300\r\n",
- "\r\n",
- " Add stage split_dataset\r\n",
- "\r\n",
- "\u001b[33mcommit dbfc854a931baf57ad116f811c2cea39d4fb69a9\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 10:33:51 2020 +0300\r\n",
- "\r\n",
- " Add stage features_extraction\r\n",
- "\r\n",
- "\u001b[33mcommit f2859056db4c53e11ba0593388fddd19018d577b\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 10:32:26 2020 +0300\r\n",
- "\r\n",
- " Initialize DVC\r\n",
- "\r\n",
- "\u001b[33mcommit 1102dc2e3f636b2d37558f95a960c788f3de32ed\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m, \u001b[m\u001b[1;32mdev\u001b[m\u001b[33m)\u001b[m\r\n",
- "Merge: 855c61a 92ac211\r\n",
- "Author: Mikhail \r\n",
- "Date: Wed Jul 1 07:22:32 2020 +0000\r\n",
- "\r\n",
- " Merge branch 'update-confusion-matrix' into 'dev'\r\n",
- " \r\n",
- " update confusion matrix\r\n",
- " \r\n",
- " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!4\r\n",
- "\r\n",
- "\u001b[33mcommit 92ac211f2139095965d0e26304d2d39003136def\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Tue Jun 30 13:08:30 2020 +0900\r\n",
- "\r\n",
- " update confusion matrix\r\n",
- "\r\n",
- "\u001b[33mcommit 855c61ac3f02f8938445fe749846e20d01e0f247\u001b[m\r\n",
- "Merge: 22aeb23 7fbf4d8\r\n",
- "Author: Alexander Kolosov \r\n",
- "Date: Mon Jun 29 08:47:37 2020 +0000\r\n",
- "\r\n",
- " Merge branch 'dev-update-pipelines' into 'dev'\r\n",
- " \r\n",
- " Dev update pipelines\r\n",
- " \r\n",
- " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!3\r\n",
- "\r\n",
- "\u001b[33mcommit 7fbf4d8f4e54be947f77dce09191b4f6fbb287f0\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Mon Jun 29 08:47:37 2020 +0000\r\n",
- "\r\n",
- " Dev update pipelines\r\n",
- "\r\n",
- "\u001b[33mcommit 22aeb23eb6b54f12f11c76a5714dbf6bff5f11f9\u001b[m\r\n",
- "Author: Mikhail \r\n",
- "Date: Sun Jun 28 19:02:29 2020 +0300\r\n",
- "\r\n",
- " Update name of tutorial and notebook\r\n",
- "\r\n",
- "\u001b[33mcommit 110a584e41fa7c140bbaf8130f70d4112e58d1a4\u001b[m\r\n",
- "Merge: 2d7e834 a8d3200\r\n",
- "Author: Mikhail \r\n",
- "Date: Sat Jun 27 07:49:11 2020 +0000\r\n",
- "\r\n",
- " Merge branch 'update-software' into 'dev'\r\n",
- " \r\n",
- " Update software\r\n",
- " \r\n",
- " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!2\r\n",
- "\r\n",
- "\u001b[33mcommit a8d3200b8cbffdc4af1c7204710d217e9f685928\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Fri Jun 26 17:58:32 2020 +0900\r\n",
- "\r\n",
- " intall toc for jupyter notebook\r\n",
- "\r\n",
- "\u001b[33mcommit 8b042ad196928f9584b4bbce058625896af78d9d\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Fri Jun 26 17:58:12 2020 +0900\r\n",
- "\r\n",
- " upgrade dvc\r\n",
- "\r\n",
- "\u001b[33mcommit 2d7e834a6d115d1b47253377b3baaace559e3259\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Thu Jun 11 12:53:18 2020 +0900\r\n",
- "\r\n",
- " add data/ to .gitignore\r\n",
- "\r\n",
- "\u001b[33mcommit 8817b3ed1f82ed1c4feb9122d49237b37356e70e\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Wed Jun 10 22:56:32 2020 +0900\r\n",
- "\r\n",
- " update Lesson 4.ipynb: append description of dvc plots diff\r\n",
- "\r\n",
- "\u001b[33mcommit a8db726c3f368c39180d61d21f21bf6727db20c0\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Wed Jun 10 22:44:00 2020 +0900\r\n",
- "\r\n",
- " update Lesson 4.ipynb: add section for dvc metrics diff and dvc plots\r\n",
- "\r\n",
- "\u001b[33mcommit 77559e316fe6b5fd0a11f27a06fbc9eed1c2b606\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Wed Jun 10 22:43:09 2020 +0900\r\n",
- "\r\n",
- " update src/evaluate.py: put metric and confusion matrix in separated files\r\n",
- "\r\n",
- "\u001b[33mcommit a0afac2ff2dc7c5815c72ec3770888b67e5f04e7\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Wed Jun 10 12:05:51 2020 +0900\r\n",
- "\r\n",
- " refactor code modules\r\n",
- "\r\n",
- "\u001b[33mcommit 73846297879b1f1be3868c64e73b7d8ad6966b09\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Wed Jun 10 12:04:37 2020 +0900\r\n",
- "\r\n",
- " fix Lesson 4.ipynb\r\n",
- "\r\n",
- "\u001b[33mcommit b6ba776f8607c6481e34f8a40af4c23a5cd36990\u001b[m\r\n",
- "Author: Alex \r\n",
- "Date: Tue Jun 9 19:27:13 2020 +0900\r\n",
- "\r\n",
- " create repo structure for lesson 4\r\n"
- ]
- }
- ],
- "source": [
- "# To see the difference between two specific commits, both need to be specified:\n",
- "\n",
- "!git log"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:01:12.974894Z",
- "start_time": "2020-07-03T19:01:11.320625Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[0m "
- ]
- }
- ],
- "source": [
- "\n",
- "!dvc params diff 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c HEAD^"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Show metrics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 149,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:42:07.828077Z",
- "start_time": "2020-07-03T19:42:06.658092Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\tdata/metrics.json: \n",
- "\t\tf1_score: 1.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# this pipeline metrics \n",
- "\n",
- "!dvc metrics show"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 150,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:42:10.492627Z",
- "start_time": "2020-07-03T19:42:09.201160Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "workspace: \n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "dev:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "dvc-tutorial:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 0.9305555555555555\n",
- "exp1-ratio-features:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 0.15384615384615383\n",
- "exp2-tuning-logreg:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "exp3-svm:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "exp2_tuning_logreg:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "exp3_svm:\n",
- "\tdata/metrics.json:\n",
- "\t\tf1_score: 1.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# show all commited pipelines metrics (all branch and tags)\n",
- "\n",
- "!dvc metrics show -a -T"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Compare metrics (get differences)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 151,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:43:27.774038Z",
- "start_time": "2020-07-03T19:43:26.104962Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[0m "
- ]
- }
- ],
- "source": [
- "!dvc metrics diff"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 152,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:44:46.444858Z",
- "start_time": "2020-07-03T19:44:44.738955Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Metric Value Change \n",
- "data/metrics.json f1_score 1.0 0.0\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# --all - list all metrics, even those without changes\n",
- "\n",
- "!dvc metrics diff --all"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* чтобы сравнить текущую метрики из текущего коммита и из другого, нужно указать другой (old) коммит:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:11:04.120125Z",
- "start_time": "2020-07-03T19:11:02.460457Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Metric Value Change \n",
- "data/metrics.json f1_score 1.0 0.84615\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Compare old and new branches\n",
- "\n",
- "\n",
- "!dvc metrics diff exp1-ratio-features exp2-svm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:10:59.357203Z",
- "start_time": "2020-07-03T19:10:57.708759Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Path Metric Value Change \n",
- "data/metrics.json f1_score 0.93056 0.77671\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Equivalent to `!dvc metrics diff exp1-ratio-features dvc-tutorial`, because dvc-tutorial - current branch\n",
- "\n",
- "!dvc metrics diff exp1-ratio-features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 157,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:50:29.269796Z",
- "start_time": "2020-07-03T19:50:29.132897Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Switched to branch 'dev'\r\n",
- "Your branch is ahead of 'origin/dev' by 7 commits.\r\n",
- " (use \"git push\" to publish your local commits)\r\n"
- ]
- }
- ],
- "source": [
- "!git checkout dev -f"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* чтобы выводить не только новую, но и старую метрики, нужно добавить опцию --old"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 154,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:48:02.485718Z",
- "start_time": "2020-07-03T19:48:01.562562Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[31mERROR\u001b[39m: failed to show metrics diff - unable to read: 'dvc.lock', YAML file structure is corrupted: while scanning a simple key\n",
- " in \"\", line 22, column 1\n",
- "could not find expected ':'\n",
- " in \"\", line 23, column 8\n",
- "\n",
- "\u001b[33mHaving any troubles?\u001b[39m Hit us up at \u001b[34mhttps://dvc.org/support\u001b[39m, we are always happy to help!\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Use --old to show both old and new metrics vlues \n",
- "\n",
- "!dvc metrics diff --old exp1-ratio-features exp2-svm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 158,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T19:50:33.253819Z",
- "start_time": "2020-07-03T19:50:31.570404Z"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "| Path | Metric | Value | Change | \n",
- "|--------|----------|---------|----------|\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc metrics diff --show-md"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Build Plots\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 165,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T20:08:18.872602Z",
- "start_time": "2020-07-03T20:08:18.869605Z"
- }
- },
- "outputs": [],
- "source": [
- "from IPython.display import IFrame"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Show"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 176,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T20:10:21.387140Z",
- "start_time": "2020-07-03T20:10:20.271263Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-show.html\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "!dvc plots show --template confusion \"data/cm.csv\" -x actual -y predicted -o data/plots-show.html"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 177,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T20:10:21.421474Z",
- "start_time": "2020-07-03T20:10:21.416923Z"
- },
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 177,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "IFrame(src='data/plots-show.html', width=500, height=500)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Diff"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 192,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T20:27:04.674839Z",
- "start_time": "2020-07-03T20:27:03.879598Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-diff.html\n",
- "\u001b[0m"
- ]
- }
- ],
- "source": [
- "# Build metircs plots for all 3 experiments\n",
- "!dvc plots diff -t confusion -o data/plots-diff.html exp1-ratio-features exp3-svm -x predicted"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 194,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-07-03T20:27:34.434387Z",
- "start_time": "2020-07-03T20:27:34.430369Z"
- },
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 194,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "IFrame(src='data/plots-diff.html', width=1000, height=400)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.2"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": false,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {
- "height": "calc(100% - 180px)",
- "left": "10px",
- "top": "150px",
- "width": "230.953px"
- },
- "toc_section_display": true,
- "toc_window_display": true
- },
- "varInspector": {
- "cols": {
- "lenName": 16,
- "lenType": 16,
- "lenVar": 40
- },
- "kernels_config": {
- "python": {
- "delete_cmd_postfix": "",
- "delete_cmd_prefix": "del ",
- "library": "var_list.py",
- "varRefreshCmd": "print(var_dic_list())"
- },
- "r": {
- "delete_cmd_postfix": ") ",
- "delete_cmd_prefix": "rm(",
- "library": "var_list.r",
- "varRefreshCmd": "cat(var_dic_list()) "
- }
- },
- "types_to_exclude": [
- "module",
- "function",
- "builtin_function_or_method",
- "instance",
- "_Feature"
- ],
- "window_display": false
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index 9bb859cd..933640bd 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,8 +1,8 @@
-
data_load:
raw_data_path: data/iris.csv
classes_names_path: data/classes.json
+
featurize:
features_path: data/iris_featurized.csv
target_column: target
@@ -15,9 +15,9 @@ data_split:
train:
- model_path: data/model.joblib
+ model_path: models/model.joblib
evaluate:
- metrics_file: data/metrics.json
- confusion_matrix: data/cm.csv
+ metrics_file: reports/metrics.json
+ confusion_matrix: reports/cm.csv
diff --git a/reports/.gitignore b/reports/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/reports/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d470460a..79a1c05b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,10 @@
-joblib==0.15.1
-jupyter==1.0.0
-jupyter_contrib_nbextensions==0.5.1
-ipykernel==5.3.0
-matplotlib==3.1.2
-numpy==1.18.1
-pandas==1.0.0
-pyyaml==5.3
-scikit-learn==0.23.1
-scipy==1.4.1
-tqdm==4.42.0
\ No newline at end of file
+dvc==2.57.2
+joblib==1.2.0
+matplotlib==3.7.1
+numpy==1.24.3
+pandas==2.0.1
+python-box==7.0.1
+pyyaml==6.0
+scikit-learn==1.2.2
+scipy==1.10.1
+tqdm==4.65.0
\ No newline at end of file
diff --git a/src/data_load.py b/src/data_load.py
index b07a8258..04005193 100644
--- a/src/data_load.py
+++ b/src/data_load.py
@@ -2,7 +2,8 @@
import json
from sklearn.datasets import load_iris
from typing import Text
-import yaml
+
+from src.utils import load_config
def data_load(config_path: Text) -> None:
@@ -12,18 +13,16 @@ def data_load(config_path: Text) -> None:
config_path {Text}: path to config
"""
- config = yaml.safe_load(open(config_path))
- raw_data_path = config['data_load']['raw_data_path']
- classes_names_path = config['data_load']['classes_names_path']
+ config = load_config(config_path)
data = load_iris(as_frame=True)
classes_names = data.target_names.tolist()
dataset = data.frame
dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]
- dataset.to_csv(raw_data_path, index=False)
+ dataset.to_csv(config.data_load.raw_data_path, index=False)
- with open(classes_names_path, 'w') as classes_names_file:
+ with open(config.data_load.classes_names_path, 'w') as classes_names_file:
json.dump(obj={'classes_names': classes_names}, fp=classes_names_file)
diff --git a/src/evaluate.py b/src/evaluate.py
index bc0d6098..d9c4e428 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -4,7 +4,8 @@
import pandas as pd
from sklearn.metrics import f1_score
from typing import Text
-import yaml
+
+from src.utils import load_config
def evaluate(config_path: Text) -> None:
@@ -13,40 +14,29 @@ def evaluate(config_path: Text) -> None:
config_path {Text}: path to config
"""
- config = yaml.safe_load(open(config_path))
- classes_names_path = config['data_load']['classes_names_path']
- test_dataset_path = config['data_split']['test_path']
- model_path = config['train']['model_path']
- metrics_path = config['evaluate']['metrics_file']
- confusion_matrix_path = config['evaluate']['confusion_matrix']
+ config = load_config(config_path)
- classes = json.load(open(classes_names_path))['classes_names']
+ classes = json.load(open(config.data_load.classes_names_path))['classes_names']
- test_dataset = pd.read_csv(test_dataset_path)
+ test_dataset = pd.read_csv(config.data_split.test_path)
y = test_dataset.loc[:, 'target'].values.astype('float32')
X = test_dataset.drop('target', axis=1).values
- clf = joblib.load(model_path)
+ clf = joblib.load(config.train.model_path)
prediction = clf.predict(X)
f1 = f1_score(y_true=y, y_pred=prediction, average='macro')
json.dump(
obj={'f1_score': f1},
- fp=open(metrics_path, 'w')
+ fp=open(config.evaluate.metrics_file, 'w')
)
- # pd.DataFrame({'actual': y, 'predicted': prediction}).apply(
- # lambda series: series.map(
- # {i: cls_name for i, cls_name in enumerate(classes)}
- # )
- # ).to_csv(confusion_matrix_path, index=False)
-
mapping = {i: cls_name for i, cls_name in enumerate(classes)}
cmdf = pd.DataFrame(
{'actual': y, 'predicted': prediction}
).apply(lambda series: series.map(mapping))
- cmdf.to_csv(confusion_matrix_path, index=False)
+ cmdf.to_csv(config.evaluate.confusion_matrix, index=False)
if __name__ == '__main__':
@@ -56,4 +46,3 @@ def evaluate(config_path: Text) -> None:
args = args_parser.parse_args()
evaluate(config_path=args.config)
-
diff --git a/src/featurization.py b/src/featurization.py
index 2306f34f..9aea31a6 100644
--- a/src/featurization.py
+++ b/src/featurization.py
@@ -1,7 +1,8 @@
import argparse
import pandas as pd
from typing import Text
-import yaml
+
+from src.utils import load_config
def get_features(dataset):
@@ -17,13 +18,11 @@ def featurize(config_path: Text) -> None:
config_path {Text}: path to config
"""
- config = yaml.safe_load(open(config_path))
- raw_data_path = config['data_load']['raw_data_path']
- featurized_dataset_path = config['featurize']['features_path']
+ config = load_config(config_path)
- dataset = pd.read_csv(raw_data_path)
+ dataset = pd.read_csv(config.data_load.raw_data_path)
features = get_features(dataset)
- features.to_csv(featurized_dataset_path, index=False)
+ features.to_csv(config.featurize.features_path, index=False)
if __name__ == '__main__':
diff --git a/src/split_dataset.py b/src/split_dataset.py
index ffd6f119..8b8bd38a 100644
--- a/src/split_dataset.py
+++ b/src/split_dataset.py
@@ -2,7 +2,8 @@
from sklearn.model_selection import train_test_split
import pandas as pd
from typing import Text
-import yaml
+
+from src.utils import load_config
def split_train_test(config_path: Text) -> None:
@@ -11,20 +12,15 @@ def split_train_test(config_path: Text) -> None:
config_path {Text}: path to config
"""
- config = yaml.safe_load(open(config_path))
- featurized_dataset_path = config['featurize']['features_path']
- train_dataset_path = config['data_split']['train_path']
- test_dataset_path = config['data_split']['test_path']
- test_size = config['data_split']['test_size']
-
- dataset = pd.read_csv(featurized_dataset_path)
+ config = load_config(config_path)
+ dataset = pd.read_csv(config.featurize.features_path)
# Split in train/test
-
+ test_size = config.data_split.test_size
df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=42)
- df_train.to_csv(train_dataset_path, index=False)
- df_test.to_csv(test_dataset_path, index=False)
+ df_train.to_csv(config.data_split.train_path, index=False)
+ df_test.to_csv(config.data_split.test_path, index=False)
if __name__ == '__main__':
diff --git a/src/train.py b/src/train.py
index fefd056e..56f39c73 100644
--- a/src/train.py
+++ b/src/train.py
@@ -4,7 +4,8 @@
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from typing import Text
-import yaml
+
+from src.utils import load_config
def train(config_path: Text) -> None:
@@ -13,11 +14,9 @@ def train(config_path: Text) -> None:
config_path {Text}: path to config
"""
- config = yaml.safe_load(open(config_path))
- train_dataset_path = config['data_split']['train_path']
- model_path = config['train']['model_path']
+ config = load_config(config_path)
# Load train set
- train_dataset = pd.read_csv(train_dataset_path)
+ train_dataset = pd.read_csv(config.data_split.train_path)
# Get X and Y
y = train_dataset.loc[:, 'target'].values.astype('float32')
@@ -27,7 +26,7 @@ def train(config_path: Text) -> None:
clf = LogisticRegression(C=0.00001, solver='lbfgs', multi_class='multinomial', max_iter=100)
clf.fit(X, y)
- joblib.dump(clf, model_path)
+ joblib.dump(clf, config.train.model_path)
if __name__ == '__main__':
@@ -36,4 +35,4 @@ def train(config_path: Text) -> None:
args_parser.add_argument('--config', dest='config', required=True)
args = args_parser.parse_args()
- train(config_path=args.config)
\ No newline at end of file
+ train(config_path=args.config)
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 00000000..ce4b6a50
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,19 @@
+import box
+from typing import Text
+import yaml
+
+
+def load_config(config_path: Text) -> box.ConfigBox:
+    """Load a YAML config file into a box.ConfigBox.
+
+    Args:
+        config_path {Text}: path to the YAML config file
+    Returns:
+        box.ConfigBox: parsed config with attribute-style access
+    """
+
+    # Read as UTF-8 explicitly so parsing does not depend on the OS locale.
+    with open(config_path, encoding='utf-8') as config_file:
+        config = yaml.safe_load(config_file)
+
+    return box.ConfigBox(config)