From 25488975ed3e94189587d6292324c05d9636f15f Mon Sep 17 00:00:00 2001 From: Tomiwa Ademidun Date: Thu, 20 Feb 2020 10:57:10 -0500 Subject: [PATCH 01/10] load toxicity 21 dataset from molaset from molnet --- README.md | 6 +++++- chapter3.py | 16 ---------------- molecule_toxicity.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) delete mode 100644 chapter3.py create mode 100644 molecule_toxicity.py diff --git a/README.md b/README.md index d4d40d0..e756875 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,8 @@ Code Samples for the book: Deep Learning for the Life Sciences ## Installation -`conda activate condaenv` \ No newline at end of file +`conda create --name condaenv --file requirements.conda.txt` +`conda activate condaenv` + +To run interpreter from conda environment: +`condaenv/bin/python3.7` \ No newline at end of file diff --git a/chapter3.py b/chapter3.py deleted file mode 100644 index 360c7ba..0000000 --- a/chapter3.py +++ /dev/null @@ -1,16 +0,0 @@ -import deepchem as dc -import numpy as np - - -def run(): - # x = 4 random arrays, each array has 5 elements - # y = 1 random array with 4 elements - x = np.random.random((4, 5)) - y = np.random.random((1, 4)) - - print('x', x) - print('y', y) - - -if __name__ == '__main__': - run() diff --git a/molecule_toxicity.py b/molecule_toxicity.py new file mode 100644 index 0000000..2850719 --- /dev/null +++ b/molecule_toxicity.py @@ -0,0 +1,13 @@ +import deepchem as dc +import numpy as np + + +def run(): + # first we must load the Toxicity 21 datasets from molnet (MoleculeNet) unto our local machine + tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21() + + print(tox21_tasks) + + +if __name__ == '__main__': + run() From cbc36b18db4b22de3c425e58ea7df8b2cdee609e Mon Sep 17 00:00:00 2001 From: ademidun Date: Thu, 5 Mar 2020 11:16:13 -0500 Subject: [PATCH 02/10] added book link to README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md 
b/README.md index e756875..433372b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # deep-learning-life-sciences-code -Code Samples for the book: Deep Learning for the Life Sciences +Code Samples for the book: [Deep Learning for the Life Sciences](https://amzn.to/3audBIt) ## Installation From 6d96b172029bf4e25cbf7775b86d548540303194 Mon Sep 17 00:00:00 2001 From: ademidun Date: Thu, 5 Mar 2020 17:26:37 -0500 Subject: [PATCH 03/10] added seperate requirements.txt packages for linux-64 and osx-64 platforms --- .gitignore | 3 +- README.md | 7 +- install.sh | 5 + ...onda.txt => requirements.conda.linux64.txt | 0 requirements.conda.osx64.txt | 157 ++++++++++++++++++ 5 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 install.sh rename requirements.conda.txt => requirements.conda.linux64.txt (100%) create mode 100644 requirements.conda.osx64.txt diff --git a/.gitignore b/.gitignore index 371299b..bff649f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea/ env/ -condaenv/ \ No newline at end of file +condaenv/ +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 433372b..697c904 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,11 @@ Code Samples for the book: [Deep Learning for the Life Sciences](https://amzn.to ## Installation -`conda create --name condaenv --file requirements.conda.txt` -`conda activate condaenv` + +If using linux: `conda create --name condaenv --file requirements.conda.linux64.txt` +If using Mac OSX: `conda create --name condaenv --file requirements.conda.osx64.txt` +If using Windows: 🤷🏿‍♂️ +`conda activate ./condaenv` To run interpreter from conda environment: `condaenv/bin/python3.7` \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..8a55461 --- /dev/null +++ b/install.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +conda create --prefix ./condaenv +conda activate ./condaenv +conda install -c deepchem -c rdkit -c conda-forge -c omnia 
deepchem=2.3.0 \ No newline at end of file diff --git a/requirements.conda.txt b/requirements.conda.linux64.txt similarity index 100% rename from requirements.conda.txt rename to requirements.conda.linux64.txt diff --git a/requirements.conda.osx64.txt b/requirements.conda.osx64.txt new file mode 100644 index 0000000..9bbb6b1 --- /dev/null +++ b/requirements.conda.osx64.txt @@ -0,0 +1,157 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: osx-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/osx-64/_py-xgboost-mutex-2.0-cpu_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/_tflow_select-2.3.0-mkl.conda +https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h0b31af3_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/c-ares-1.15.0-h01d97ff_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2019.11.28-hecc5488_0.tar.bz2 +https://conda.anaconda.org/omnia/osx-64/fftw3f-3.3.4-2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/jpeg-9c-h1de35cc_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libcxx-9.0.1-1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libiconv-1.15-h01d97ff_1005.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libsodium-1.0.17-h01d97ff_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/llvm-openmp-9.0.1-h28b9765_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pandoc-2.9.2-0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pixman-0.38.0-h01d97ff_1003.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.4-h1de35cc_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/zlib-1.2.11-h0b31af3_1006.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/blosc-1.17.1-h4a8c4bd_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/expat-2.2.9-h4a8c4bd_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/icu-58.2-h0a44026_1000.tar.bz2 
+https://conda.anaconda.org/conda-forge/osx-64/libffi-3.2.1-h6de7cb9_1006.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libgfortran-4.0.0-2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libpng-1.6.37-h2573ce8_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libprotobuf-3.11.4-hd174df1_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libxgboost-0.90-h4a8c4bd_4.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/lz4-c-1.8.3-h6de7cb9_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.1-h0a44026_1002.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/openssl-1.1.1d-h0b31af3_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pcre-8.44-h4a8c4bd_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.10-hbbe82c9_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/zeromq-4.3.2-h6de7cb9_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/freetype-2.10.0-h24853df_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/gettext-0.19.8.1-h46ab8bc_1002.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/hdf5-1.10.5-nompi_h3e39495_1104.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/libboost-1.67.0-hebc422b_4.conda +https://conda.anaconda.org/conda-forge/osx-64/libopenblas-0.3.8-h3d69b6c_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libxml2-2.9.9-hd80cff7_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/readline-8.0-hcfe32e1_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/zstd-1.4.4-he7fca8b_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.13.1-h1027ab8_1000.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libblas-3.8.0-15_openblas.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libtiff-4.1.0-ha78913b_3.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/sqlite-3.30.1-h93121df_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libcblas-3.8.0-15_openblas.tar.bz2 
+https://conda.anaconda.org/conda-forge/osx-64/liblapack-3.8.0-15_openblas.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/python-3.7.6-hfa4aa89_4_cpython.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/qt-5.9.7-h93ee506_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/appnope-0.1.0-py37_1000.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/astor-0.7.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/attrs-19.3.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/backcall-0.1.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/certifi-2019.11.28-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/chardet-3.0.4-py37_1003.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/cython-0.29.15-py37h4a8c4bd_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/decorator-4.4.2-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.6.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/entrypoints-0.3-py37_1000.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/gast-0.3.3-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/glib-2.58.3-py37h577aef8_1002.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/idna-2.9-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/kiwisolver-1.1.0-py37ha1b3eb9_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/markupsafe-1.1.1-py37h0b31af3_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/mistune-0.8.4-py37h0b31af3_1000.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/numpy-1.18.1-py37hde6bac1_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.4.2-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/parso-0.6.2-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pickleshare-0.7.5-py37_1000.tar.bz2 
+https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.7.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.6.0-py_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.20-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.6-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pysocks-1.7.1-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pytz-2019.3-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pyzmq-19.0.0-py37h4bf09a9_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/qtpy-1.9.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/send2trash-1.5.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/sip-4.19.8-py37h0a44026_1000.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/six-1.14.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/termcolor-1.1.0-py_2.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/testpath-0.4.4-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/tornado-6.0.4-py37h0b31af3_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.1.8-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/werkzeug-1.0.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/wrapt-1.12.0-py37h0b31af3_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/zipp-3.1.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/absl-py-0.9.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/cairo-1.16.0-h0ab9d94_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/cffi-1.14.0-py37h356ff06_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/dbus-1.13.6-h2f22bb5_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/h5py-2.10.0-nompi_py37h106b333_102.tar.bz2 
+https://conda.anaconda.org/conda-forge/osx-64/importlib_metadata-1.5.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/jedi-0.16.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/mock-3.0.5-py37_0.tar.bz2 +https://conda.anaconda.org/omnia/osx-64/openmm-7.4.1-py37_cuda101_rc_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pexpect-4.8.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pillow-7.0.0-py37h918e99a_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/py-boost-1.67.0-py37h6440ff4_4.conda +https://conda.anaconda.org/conda-forge/osx-64/pyrsistent-0.15.7-py37h0b31af3_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/scipy-1.4.1-py37h82752d6_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/setuptools-45.2.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/terminado-0.8.3-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/traitlets-4.3.3-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/bleach-3.1.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/cryptography-2.8-py37hafa8578_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/grpcio-1.23.0-py37h8a88325_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/jinja2-2.11.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/joblib-0.14.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/jsonschema-3.2.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/jupyter_core-4.6.3-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/keras-applications-1.0.8-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/keras-preprocessing-1.1.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/markdown-3.2.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/matplotlib-base-3.2.0-py37h11da6c2_1.tar.bz2 
+https://conda.anaconda.org/conda-forge/noarch/networkx-2.4-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/numexpr-2.7.1-py37h4f17bb1_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pandas-1.0.1-py37h4f17bb1_0.tar.bz2 +https://conda.anaconda.org/omnia/osx-64/pdbfixer-1.6-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/protobuf-3.11.4-py37h4a8c4bd_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pygments-2.5.2-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pyqt-5.9.2-py37h2a560b1_4.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.34.2-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/jupyter_client-6.0.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/matplotlib-3.2.0-1.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/nbformat-5.0.4-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pip-20.0.2-py_2.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-2.0.10-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pyopenssl-19.1.0-py_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pytables-3.6.1-py37h6f8395a_1.tar.bz2 +https://conda.anaconda.org/rdkit/osx-64/rdkit-2019.09.3.0-py37h65625ec_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/scikit-learn-0.22.2.post1-py37h3dc85bc_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/tensorboard-1.14.0-py37_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/tensorflow-base-1.14.0-mkl_py37h5a24fda_0.conda +https://conda.anaconda.org/conda-forge/osx-64/tensorflow-estimator-1.14.0-py37h5ca1d4c_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/ipython-7.13.0-py37h5ca1d4c_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/mdtraj-1.9.3-py37h3e38534_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/nbconvert-5.6.1-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/py-xgboost-0.90-py37_4.tar.bz2 
+https://conda.anaconda.org/deepchem/noarch/simdna-0.4.2-py_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/tensorflow-1.14.0-mkl_py37h085be34_0.conda +https://conda.anaconda.org/conda-forge/osx-64/urllib3-1.25.7-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/ipykernel-5.1.4-py37h5ca1d4c_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/requests-2.23.0-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/xgboost-0.90-py37h4a8c4bd_4.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/jupyter_console-6.0.0-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/notebook-6.0.3-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/qtconsole-4.7.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/widgetsnbextension-3.5.1-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/ipywidgets-7.5.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/jupyter-1.0.0-py_2.tar.bz2 +https://conda.anaconda.org/deepchem/osx-64/deepchem-2.3.0-py37_0.tar.bz2 From dc51d48404316ccec24c23cdbf2ff0150195cf11 Mon Sep 17 00:00:00 2001 From: ademidun Date: Sat, 7 Mar 2020 19:52:52 -0500 Subject: [PATCH 04/10] added script explaining tox21 data and training model and script for quick start --- README.md | 4 +++- molecule_toxicity.py | 41 +++++++++++++++++++++++++++++++++++++++++ start.sh | 3 +++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 start.sh diff --git a/README.md b/README.md index 697c904..ad7217b 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ Code Samples for the book: [Deep Learning for the Life Sciences](https://amzn.to If using linux: `conda create --name condaenv --file requirements.conda.linux64.txt` If using Mac OSX: `conda create --name condaenv --file requirements.conda.osx64.txt` If using Windows: 🤷🏿‍♂️ -`conda activate ./condaenv` + + +`conda activate ./condaenv` or `source start.sh` To run interpreter from conda environment: `condaenv/bin/python3.7` \ No newline at end 
of file diff --git a/molecule_toxicity.py b/molecule_toxicity.py index 2850719..ee92505 100644 --- a/molecule_toxicity.py +++ b/molecule_toxicity.py @@ -3,11 +3,52 @@ def run(): + """ + tox21_tasks is a list of chemical assays and our dataset + contains training data that will tell us whether a certain molecule binds + to one of the molecules in + :return: + :rtype: + """ # first we must load the Toxicity 21 datasets from molnet (MoleculeNet) unto our local machine tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21() + + # tox21_tasks represent 12 assays or bilogicial targets taht we want to see if our molecule binds to print(tox21_tasks) + # train_dataset is 6264 molecules with a feature vector of length 1024 + + + # it has a feature vector Y, for each of the 12 assays + train_dataset, valid_dataset, test_dataset = tox21_datasets + + # the w represents the weights and a weight of zero means that no experiment was run + # to see if the molecule binds to that assay + np.count_nonzero(train_dataset.w == 0) + + # this is a BalancingTransformer because most of the molecules do not bind to most targets + # so most of the labels are zero and a model always predicting zero could actually work (but it would be useless!) 
+ # BalancingTransformer adjusts dataset's wieghts of individual points so all classes have same total weight + # Loss function won't have systematic preference for one class + print(transformers) + +def train_model(train_dataset): + """ + Train the model using a multitask classifier because there are multiple labels for each sample + :param train_dataset: + :type train_dataset: + :return: + :rtype: + """ + + # layer_sizes means that we have one hidden layer which has a width of 1,000 + model = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[100]) + + # nb_epoch means that we will divide the data into batches, and do one step of gradient descent for each batch + model.fit(train_dataset, nb_epoch=10) + + if __name__ == '__main__': run() diff --git a/start.sh b/start.sh new file mode 100644 index 0000000..4f79034 --- /dev/null +++ b/start.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +conda activate ./condaenv \ No newline at end of file From 0103619a141fdad9b7741f3a2cb79150585e8a6f Mon Sep 17 00:00:00 2001 From: ademidun Date: Sat, 14 Mar 2020 20:07:04 -0400 Subject: [PATCH 05/10] add notes and comments to predicting molecule toxicity and evaluate dataset using mean ROC AUC --- molecule_toxicity.py | 37 ++++++++++++++++++++++++++++++++++--- start.sh | 6 +++++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/molecule_toxicity.py b/molecule_toxicity.py index ee92505..e012b92 100644 --- a/molecule_toxicity.py +++ b/molecule_toxicity.py @@ -34,21 +34,52 @@ def run(): # Loss function won't have systematic preference for one class print(transformers) -def train_model(train_dataset): + train_model(train_dataset, test_dataset, transformers) + +def train_model(train_dataset, test_dataset, transformers): """ - Train the model using a multitask classifier because there are multiple labels for each sample + Train the model using a multitask classifier because there are multiple outputs for each sample + and evaluate model using the mean ROC 
AUC. :param train_dataset: :type train_dataset: + :param transformers: + :type transformers: :return: :rtype: """ + # this model builds a fully connected network (an MLP) + # since we have 12 assays we're testing for, being able to map to multiple outputs is ideal # layer_sizes means that we have one hidden layer which has a width of 1,000 - model = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[100]) + model = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000]) # nb_epoch means that we will divide the data into batches, and do one step of gradient descent for each batch model.fit(train_dataset, nb_epoch=10) + # how do we know how accurate our model is? we will find the mean ROC AUC score across all tasks + + # What is an ROC AUC score? We are trying to predict the toxicity of the molecules, + # Receiver Operating Characteristic, Area Under Curve + # If there exists any threshold value where, the true positive rate is 1 and false positive is 0 then score is 1 + # so we pick a threshold of what is considered a toxic molecule + # if we pick a threshold value that's too low, we will say too many safe molecules are toxic (high false positive) + # alternatively, if we pick one too igh, we will say that toxic molecules are safe (high false negative) + # note on understanding false positive terminology.\: + # Imagine a molecule that is actually toxic. "Is this molecule toxic?" "No." 
We gave a negative response + # the answer is relative to what we are testing for, in this case, we are testing if a molecule is toxic + # so we are making a tradeoff between high false positive vs high false negative so we use something called + # an ROC AUC curve, which graphs the tradeofff between the false positive rate and the true positive rate + + metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean) + + # evaluate the performance of this model on the train_dataset using the ROC AUC metric + + train_scores = model.evaluate(train_dataset, [metric], transformers) + test_scores = model.evaluate(test_dataset, [metric], transformers) + + # the train scores are higher than our test scores which shows us that our model has been overfit + print(f'train_scores: {train_scores}') + print(f'test_scores: {test_scores}') if __name__ == '__main__': run() diff --git a/start.sh b/start.sh index 4f79034..8fb6b65 100644 --- a/start.sh +++ b/start.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash +export PATH="/opt/anaconda3/bin:$PATH" +conda init bash +conda activate ./condaenv -conda activate ./condaenv \ No newline at end of file +# to go back to normal $PATH +# source ~/.bash_profile \ No newline at end of file From 56e5166a5be6db3db9abd91e31899594a8732fb6 Mon Sep 17 00:00:00 2001 From: ademidun Date: Sun, 15 Mar 2020 15:46:04 -0400 Subject: [PATCH 06/10] put molecule_toxicity into a chapter 3 folder --- chapter_3/mnist_digit_recognition.py | 0 molecule_toxicity.py => chapter_3/molecule_toxicity.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 chapter_3/mnist_digit_recognition.py rename molecule_toxicity.py => chapter_3/molecule_toxicity.py (100%) diff --git a/chapter_3/mnist_digit_recognition.py b/chapter_3/mnist_digit_recognition.py new file mode 100644 index 0000000..e69de29 diff --git a/molecule_toxicity.py b/chapter_3/molecule_toxicity.py similarity index 100% rename from molecule_toxicity.py rename to chapter_3/molecule_toxicity.py From 
07cf5d46ef4404a9cd769879f7a14a3797430e3a Mon Sep 17 00:00:00 2001 From: ademidun Date: Sun, 15 Mar 2020 16:32:49 -0400 Subject: [PATCH 07/10] create a script to build a mnist model and add a bash script to get MNIST data --- .gitignore | 4 +- chapter_3/get_mnist_data.sh | 10 +++ chapter_3/mnist_digit_recognition.py | 97 ++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 chapter_3/get_mnist_data.sh diff --git a/.gitignore b/.gitignore index bff649f..66c9b9d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .idea/ env/ condaenv/ -.DS_Store \ No newline at end of file +.DS_Store + +chapter_3/MNIST_DATA/ \ No newline at end of file diff --git a/chapter_3/get_mnist_data.sh b/chapter_3/get_mnist_data.sh new file mode 100644 index 0000000..d2944ad --- /dev/null +++ b/chapter_3/get_mnist_data.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +mkdir MNIST_DATA + +cd MNIST_DATA + +wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz +wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz +wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz +wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz diff --git a/chapter_3/mnist_digit_recognition.py b/chapter_3/mnist_digit_recognition.py index e69de29..8193cf3 100644 --- a/chapter_3/mnist_digit_recognition.py +++ b/chapter_3/mnist_digit_recognition.py @@ -0,0 +1,97 @@ +""" +In molecule_toxicity.py we used a premade model class, now we will be creating an architecture from scratch +The reason we might want to do this is if we are working on a dataset where no predefined architecture exists. +# todo what is the difference between an architecture and a model? + +This works by creating two convolution layerr which is a small square that is a subset of the image. +Then uses two fully connected layers to predict the digit from the local features. + +# todo what does that actually mean? 
[tk include a diagram] +""" +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data + +import deepchem as dc +import deepchem.models.tensorgraph.layers as layers + + +def create_model(): + """ + Create our own MNIST model from scratch + :return: + :rtype: + """ + mnist = input_data.read_data_sets("MNIST_DATA/", one_hot=True) + + # the layers from deepchem are the building blocks of what we will use to make our deep learning architecture + + # now we wrap our dataset into a NumpyDataset + + train_dataset = dc.data.NumpyDataset(mnist.train.images, mnist.train.labels) + test_dataset = dc.data.NumpyDataset(mnist.test.images, mnist.test.labels) + + # we will create a model that will take an input, add multiple layers, where each layer takes input from the + # previous layers. + + model = dc.models.TensorGraph(model_dir='mnist') + + # 784 corresponds to an image of size 28 X 28 + # 10 corresponds to the fact that there are 10 possible digits (0-9) + # the None indicates that we can accept any size input (e.g. 
an empty array or 500 items each with 784 features) + # our data is also categorical so we must one hot encode, set single array element to 1 and the rest to 0 + feature = layers.Feature(shape=(None, 784)) + labels = layers.Label(shape=(None, 10)) + + # in order to apply convolutional layers to our input, we convert flat vector of 785 to 28X28 + # in_layers means it takes our feature layer as an input + make_image = layers.Reshape(shape=(None, 28, 28), in_layers=feature) + + # now that we have reshaped the input, we pass to convolution layers + + conv2d_1 = layers.Conv2D(num_outputs=32, activation_fn=tf.nn.relu, in_layers=make_image) + + conv2d_2 = layers.Conv2D(num_outputs=64, activation_fn=tf.nn.relu, in_layers=conv2d_1) + + # we want to end by applying fully connected (Dense) layers to the outputs of our convolutional layer + # but first, we must flatten the layer from a 2d matrix to a 1d vector + + flatten = layers.Flatten(in_layers=conv2d_2) + dense1 = layers.Dense(out_channels=1024,activation_fn=tf.nn.relu, in_layers=flatten) + + # note that this is final layer so out_channels of 10 represents the 10 outputs and no activation_fn + dense2 = layers.Dense(out_channels=10,activation_fn=None, in_layers=dense1) + + # next we want to connect this output to a loss function, so we can train the output + + # compute the value of loss function for every sample then average of all samples to get final loss (ReduceMean) + smce = layers.SoftMaxCrossEntropy(in_layers=[labels, dense2]) + loss = layers.ReduceMean(in_layers=smce) + model.set_loss(loss) + + # for MNIST we want the probability that a given sample represents one of the 10 digits + # we can achieve this using a softmax function to get the probabilities, then cross entropy to get the labels + + output = layers.SoftMax(in_layers=dense2) + model.add_output(output) + + # if our model takes long to train, reduce nb_epoch to 1 + model.fit(train_dataset,nb_epoch=10) + + # our metric is accuracy, the fraction of labels 
that are accurately predicted + metric = dc.metrics.Metric(dc.metrics.accuracy_score) + + train_scores = model.evaluate(train_dataset, [metric]) + test_scores = model.evaluate(test_dataset,[metric]) + + print('train_scores', train_scores) + print('test_scores', test_scores) + +if __name__ == '__main__': + create_model() + + + + + + + From dd87539d6189456cc6ddd64c73228fa5c8eed68c Mon Sep 17 00:00:00 2001 From: ademidun Date: Sun, 15 Mar 2020 19:17:03 -0400 Subject: [PATCH 08/10] added notes to README.md for Chapter 4: Machine Learning for Molecules --- .gitignore | 3 +- README.md | 124 ++++++++++++++++++++++++++- chapter_3/mnist_digit_recognition.py | 2 +- 3 files changed, 126 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 66c9b9d..061d9b8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ env/ condaenv/ .DS_Store -chapter_3/MNIST_DATA/ \ No newline at end of file +chapter_3/MNIST_DATA/ +chapter_3/mnist/ \ No newline at end of file diff --git a/README.md b/README.md index ad7217b..7f34d3c 100644 --- a/README.md +++ b/README.md @@ -12,4 +12,126 @@ If using Windows: 🤷🏿‍♂️ `conda activate ./condaenv` or `source start.sh` To run interpreter from conda environment: -`condaenv/bin/python3.7` \ No newline at end of file +`condaenv/bin/python3.7` + +# Book Notes + +# Deep Learning for the Life Sciences + +I think that in the near future the biggest techonological improvements in society will come from advances in artificial intelligence and biology. So I've been trying to learn more about the intersection of artificial intelligence and biology. I decided to read the book, Deep Learning for the Life Sciences because it covers both topics. + +If you want to become very rich or help a lot of people in the future, I recommend learning data science and biology. More specifically deep learning and genetics. I recently bought a book about deep learning and genetics. 
+ + +## Introduction Life Science is Data Science +- The book begins by talking about how modern life sciences is increasingly driven by data science and algorithms +- As I have mentioned earlier, it's not so much that the algorithms we have are more sophisticated, it's that we now have access to smarter computers +- Put mroe bluntly, the computers have gotten way smarter, we on the other hadm have gotten marginally smarter at best. + +## Introduction to Deep Learning +- deep learning at it's most basic level is simply a function, f() that transforms an input x, into an output, y: y = f(x) [7] +- The simplest models are linear models of the form y = Mx + b; which is essentially just a straight line [8] +- These are very limited by the fact that they can't fit most datasets. For example, height distribution of the average person would likely not fit a linear model + +- A solution to this is basically a multilayer perceptron, which is essentially just putting one linear function inside another linear function +y = M_2(B(M_1x + b_1)) + b_2 +- B is called an activation function and transforms the linear function into a non-linear function + +- As you put one of these functions inside another one you create what is called a multilayer perceptron +- An interesting blog post which explains the first Perceptron/Neural net, Rosenblatt's Perceptron ([blog post](https://towardsdatascience.com/rosenblatts-perceptron-the-very-first-neural-network-37a3ec09038a) (tk find paper not behind Medium paywall), [paper](tk add paper link)) + +### Training Models (13) +- To train the algorithm we need a loss function, L(y,y') where Y is the actual output and y' is the target value that we expected to get +- The loss function basically takes the two values to give us a value for how wrong we are +- Usually we use the Euclidean distance +- Also does anyone know a good way of writing mathematical notation on Github? Maybe I should use latex for this review? 
+- For probability distribution you should use cross entropy (really didn't understand this part) + + +## Chapter 4: Machine Learning For Molecules + +- random search is often used for designing interesting molecules +- How can we find more efficient ways of designing molecules? +- The first step is to transform molecules into vectors of numbers, called molecular featurization +- This includes things like chemical descriptor vectors, 2D graph representations, + 3D electrostatic grid representations, orbital basis function representations and more + + +### What is a molecule? +- How do you know which molecules are present in a given sample? +- use a mass spectrometer to fire a bunch of electrons at the sample +- the sample becomes ionized or charged and gets propelled by an electric field +- the different fragments go into different buckets based on their mass-to-charge ratio (m/q ion mass/ion charge) +- the spread of the different charged fragments is called the spectrum +- you can then use the ratio of the different fragments in each bucket to figure out which molecule you have + +- molecules are dynamic, quantum entities: the atoms in a molecule are always moving (dynamic) + - a given molecule can be described in multiple ways (quantum) + +### What are Molecular bonds? + + - Covalent bonds are strong bonds formed when two atoms share electrons + - it takes a lot of energy to break them + - this is what actually defines a molecule, a group of atoms joined by covalent bonds + + - non-covalent bonds are not as strong as covalent bonds, + - constantly breaking and reforming, they have a huge effect on determining shape and interaction of molecules + - some examples include hydrogen bonds, pi-stacking, salt bridges etc. 
 - but non-covalent bonds are important because most drugs interact with + biological molecules in human body through non-covalent interactions + + - For example water, H2O, two hydrogen atoms are strongly attached to an oxygen atom using a covalent bond and + that is what forms a water molecule + + - then different water molecules are attached to other water molecules using a hydrogen bond + - this is what makes water the universal solvent + + ### Chirality of Molecules + - some molecules come in two forms that are mirror images of each other + - a right form "R" form and the left-form "S" form + - Many physical properties are identical for both and have identical molecular graphs + - Important because it is possible for both forms to bind to different proteins in body and produce various effects + - For example, in the 1950s thalidomide was prescribed as a sedative for nausea and morning sickness for pregnant women + but only the R form is the sedative, but the S form was a teratogen that has been shown to cause severe defects + + +### Featurize Molecules +- SMILES strings are a way of describing molecules using text strings +- Extended-Connectivity Fingerprints are a way of converting arbitrary length strings into a fixed-length vector +```python +import deepchem as dc +from rdkit import Chem +smiles = ['C1CCCCC1', 'O1CCOCC1'] # cyclohexane and dioxane +mols = [Chem.MolFromSmiles(smile) for smile in smiles] +feat = dc.feat.CircularFingerprint(size=1024) +arr = feat.featurize(mols) +# arr is a 2-by-1024 array containing the fingerprints of the two molecules +``` +- Chemical fingerprints are vectors of 1s and 0s, indicating the presence or absence of a molecular feature +- algorithm works by starting to look at each atom individually, then works outwards + +- another line of thinking says that use physics of the molecule's structure to describe the molecules +- this will typically work best for problems that rely on generic properties of the molecules +and not 
detailed arrangement of atoms +```python +import deepchem as dc +from rdkit import Chem +smiles = ['C1CCCCC1', 'O1CCOCC1'] # cyclohexane and dioxane +mols = [Chem.MolFromSmiles(smile) for smile in smiles] +feat = dc.feat.RDKitDescriptors() +arr = feat.featurize(mols) +# arr is a 2-by-111 array containing the properties of the two molecules +``` + +### Graph Convolutions + +- those previous examples, all required a human that thought of an algorithm that could represent the molecules +in a way that a computer could understand +- what if there was a way to feed the graph representation of a molecule into a deep learning architecture and have the +model figure out the features of the molecule +- similar to how a deep learning model can learn about properties of an image without being supervised +- the limitation is that the calculation is based on the molecular graph, so it doesn't know anything about +the molecule's conformation +- so it works best for small, rigid molecules, Chapter 5 looks at methods for large, flexible molecules + + \ No newline at end of file diff --git a/chapter_3/mnist_digit_recognition.py b/chapter_3/mnist_digit_recognition.py index 8193cf3..3cc524a 100644 --- a/chapter_3/mnist_digit_recognition.py +++ b/chapter_3/mnist_digit_recognition.py @@ -75,7 +75,7 @@ def create_model(): model.add_output(output) # if our model takes long to train, reduce nb_epoch to 1 - model.fit(train_dataset,nb_epoch=10) + model.fit(train_dataset,nb_epoch=1) # our metric is accuracy, the fraction of labels that are accurately predicted metric = dc.metrics.Metric(dc.metrics.accuracy_score) From 575462e9801d59529b8e98559093ef7c69182c1b Mon Sep 17 00:00:00 2001 From: ademidun Date: Sun, 15 Mar 2020 19:31:25 -0400 Subject: [PATCH 09/10] add script to predict molecule solubility --- chapter_4/predict_solubility.py | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 chapter_4/predict_solubility.py diff --git 
a/chapter_4/predict_solubility.py b/chapter_4/predict_solubility.py new file mode 100644 index 0000000..aabd174 --- /dev/null +++ b/chapter_4/predict_solubility.py @@ -0,0 +1,55 @@ +""" +Train a model to predict a molecule's solubility (ability to dissolve in water). +""" +import deepchem as dc +from deepchem.models import GraphConvModel +from rdkit import Chem +def create_predict_solubility_model(): + + # as explained in the Readme, we will use a GraphConv featurizer + # which means that the model will learn from itself what features to use to describe the molecule + tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv') + + train_dataset, valid_dataset, test_dataset = datasets + + # to reduce overfitting we say that dropout=0.2 + # this means that 20% of outputs from each layer will randomly be set to 0 + # n_tasks because there is only one ooutput we are trying to get + # mode = 'regression' because we want a continuous variable representing the solubility score + # in contrast to the categorical model we built in chapter 3 that is picking one value from a set of options + model = GraphConvModel(n_tasks=1, mode='regression', dropout=0.2) + model.fit(train_dataset, nb_epoch=100) + + # now we will use the pearson coefficient to measure how well our model does + # pearson coefficient is measuring the linear correlation between two variables + # todo: why did we pick pearson coefficient? 
+ metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + + train_scores = model.evaluate(train_dataset, [metric], transformers) + test_scores = model.evaluate(test_dataset, [metric], transformers) + + return model + +def run(): + + model = create_predict_solubility_model() + + smiles = ['COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C', + 'CCOC(=O)CC', + 'CSc1nc(NC(C)C)nc(NC(C)C)n1', + 'CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1', + 'Cc1cc2ccccc2cc1C'] + + # in order to run the SMILES strings on our model we need to convert to a format expected by the graph convolution + mols = [Chem.MolFromSmiles(smile) for smile in smiles] + + featurizer = dc.feat.ConvMolFeaturizer(mols) + x = featurizer.featurize(mols) + + predicted_solubility = model.predict_on_batch(x) + + print(predicted_solubility) + + +if __name__ == '__main__': + run() From f0c27ff18c13b17708830aa312f68f7552809e46 Mon Sep 17 00:00:00 2001 From: ademidun Date: Sun, 15 Mar 2020 20:16:23 -0400 Subject: [PATCH 10/10] add SMARTS strings notes and code --- README.md | 13 ++++++++++++- chapter_4/predict_solubility.py | 4 ++++ chapter_4/smarts_strings.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 chapter_4/smarts_strings.py diff --git a/README.md b/README.md index 7f34d3c..7cd294d 100644 --- a/README.md +++ b/README.md @@ -134,4 +134,15 @@ model figure out the features of the molecule the molecule's conformation - so it works best for small, rigid molecules, Chapter 5 looks at methods for large, flexible molecules - \ No newline at end of file + ### SMARTS Strings + - [MoleculeNet](http://moleculenet.ai) is a dataset for molecular machine learning + - Ranging from low-level quantum mechanics interactions between atoms + - to high-level interactions in human body like toxicity and side effects + - SMARTS strings are useful if you want to see if atoms in a molecule match a pattern: + - Searching a molecular database to see if a particular substructure exists + - aligning a set 
of molecules on common substructure to improve visualization +- SMARTS string is like regular expression for regular languages +- So "foo*.bar" will match "foo.bar" and "foo3.bar" +- Similarly "CCC" will match sequences of three adjacent aliphatic carbon atoms +(aliphatic means containing carbon and hydrogen joined together in a straight line) +- \ No newline at end of file diff --git a/chapter_4/predict_solubility.py b/chapter_4/predict_solubility.py index aabd174..cf208cb 100644 --- a/chapter_4/predict_solubility.py +++ b/chapter_4/predict_solubility.py @@ -28,6 +28,10 @@ def create_predict_solubility_model(): train_scores = model.evaluate(train_dataset, [metric], transformers) test_scores = model.evaluate(test_dataset, [metric], transformers) + # the train scores are higher than our test scores which shows us that our model has been overfit + print(f'train_scores: {train_scores}') + print(f'test_scores: {test_scores}') + return model def run(): diff --git a/chapter_4/smarts_strings.py b/chapter_4/smarts_strings.py new file mode 100644 index 0000000..ae5e270 --- /dev/null +++ b/chapter_4/smarts_strings.py @@ -0,0 +1,31 @@ +""" +SMARTS strings are used to match a sequence of molecules. +Similar to regular expressions for natural language. 
+""" +from rdkit import Chem +from rdkit.Chem.Draw import MolsToGridImage + +def get_smart(): + + smiles_list = ["CCCC", "CCOCC", "CCNCC", "CCNCC", "CCSCC"] + + mol_list = [Chem.MolFromSmiles(smile) for smile in smiles_list] + + # now let see which SMILES strings match the SMARTS pattern + + # match 3 adjacent aliphatic carbon atoms + smarts_query_string = "CCC" + query = Chem.MolFromSmarts(smarts_query_string) + + match_list = [mol.GetSubstructMatch(query) for mol in mol_list] + img = MolsToGridImage(mols=mol_list, molsPerRow=4, highlightAtomLists=match_list) + img.show() + + # natch an aliphatic carbon attached to any atom, attached to another aliphatic carbon + query = Chem.MolFromSmarts("C[C,O,N]C") + match_list = [mol.GetSubstructMatch(query) for mol in mol_list] + img = MolsToGridImage(mols=mol_list, molsPerRow=4, highlightAtomLists=match_list) + img.show() + +if __name__ == '__main__': + get_smart()