diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e2150e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,144 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Data +*.parquet + +# Slurm logs +slurm-*.out + +# Training logs +logs/ + +# Wandb +wandb/ + +# OS related +**/.DS_Store diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c925edd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "HIRID-ICU-Benchmark"] + path = HIRID-ICU-Benchmark + url = git@github.com:ratschlab/HIRID-ICU-Benchmark.git diff --git a/HIRID-ICU-Benchmark b/HIRID-ICU-Benchmark new file mode 160000 index 0000000..bee7700 --- /dev/null +++ b/HIRID-ICU-Benchmark @@ -0,0 +1 @@ +Subproject commit bee770094bf8389920bc09823895b87e09a563dd diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1120dc0 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ + +PYTHON := $(shell which python) + +# Check types and cleanup codebase +check-types: + @echo "----------- Type Checking -----------" + mypy --config-file ./mypy.ini famews/ tests/ + +format: + @echo "----------- Code Formatting (black) -----------" + $(PYTHON) -m black --config famews/pyproject.toml famews tests + @echo "----------- Import Formatting (isort) -----------" + $(PYTHON) -m isort --settings-path famews/pyproject.toml famews tests + +check-format: + @echo "----------- Check Code Formatting (black) -----------" + $(PYTHON) -m black --config famews/pyproject.toml --check famews tests + @echo "----------- Import Formatting (isort) -----------" + $(PYTHON) -m isort --settings-path famews/pyproject.toml -c famews tests + +test: + @echo "----------- Run Tests (pytest) -----------" + $(PYTHON) -m pytest --cov-report term --cov=famews/famews 
tests/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c233c6 --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ + +# FAMEWS: a Fairness Auditing tool for Medical Early-Warning Systems + +![FAMEWS Workflow](./data/figures/summary_tool_paper.png) + +## Setup + +This repository depends on the work done by [Yèche et al. HiRID Benchmark](https://github.com/ratschlab/HIRID-ICU-Benchmark) +to preprocess the HiRID dataset and get it ready for model training, as well as inference and fairness analysis. + +The [HiRID Benchmark](https://github.com/ratschlab/HIRID-ICU-Benchmark) repository with the preprocessing is included as a submodule in this repository. To clone the repository with the submodule, run: + +```bash +git submodule init +git submodule update + +# follow instructions in `HiRID Benchmark` repository to download and preprocess the dataset +# the subsequent steps rely on the different stage outputs defined by Yèche et al. +``` + +### Conda Environment + +A conda environment configuration is provided: `environment_linux.yml`. You can create +the environment with: +``` +conda env create -f environment_linux.yml +``` + +### Code Package + +The `famews` package contains the relevant code components +for the pipeline. Install the package into your environment +with: +``` +pip install -e ./famews +``` + +### Configurations + +We use [Gin Configurations](https://github.com/google/gin-config/tags) to configure the +machine learning pipelines, preprocessing, and evaluation pipelines. Example configurations are in `./config/example`. +If you implement a configurable component, please make sure an example config of that component can be found +in one of the example configurations. + +## Pipeline Overview + +Any task (preprocessing, training, evaluation) is to be run with a script located in +`famews/scripts`. Ideally these scripts invoke a `Pipeline` object, which consists of different +`PipelineStage` objects. 
+ +### Preprocessing + +#### HiRID + +TODO: refer to HiRID submodule + +### ML Training + +#### `DLTrainPipeline` + +A training run is divided into a set of training stages (implemented as `PipelineStage`) +and configured using gin configuration files. + +An individual run can be started with a command similar to the following: +``` +python -m famews.scripts.train_sequence_model \ + -g ./config/example/lstm_test.gin \ + -l ./logs/lstm_test \ + --seed 1111 \ + --wandb_project test +``` + + + + + + diff --git a/data/figures/summary_tool_paper.png b/data/figures/summary_tool_paper.png new file mode 100644 index 0000000..b9ecc0e Binary files /dev/null and b/data/figures/summary_tool_paper.png differ diff --git a/environment_linux.yml b/environment_linux.yml new file mode 100644 index 0000000..680b5c1 --- /dev/null +++ b/environment_linux.yml @@ -0,0 +1,48 @@ +name: famews + +channels: + - defaults + - nvidia + - pytorch + - conda-forge + +dependencies: + - python=3.9 + - pytorch::pytorch-cuda=11.7 + - pytorch::pytorch>=2.0.0 + - pytorch::torchvision>=0.14.1 + - pytorch::torchaudio>=0.13.1 + - pytorch-lightning>=1.9.3,<2.0.0 + - numpy=1.23 + - pandas<2.0 + - scikit-learn>=1.2.0 + - jupyter>=1.0.0 + - jupyterlab>=3.5.3 + - seaborn>=0.12.2 + - matplotlib>=3.6.2 + - pytest==7.1.2 + - pytest-cov==3.0.0 + - codecov==2.1.11 + - mypy==0.981 + - pip>=21.0.1 + - tqdm>=4.64 + - pyarrow>=8.0.0 + - pytables>=3.7.0 + - black==22.6.0 + - isort==5.9.3 + - pyspark>=3.2.1 + - conda-lock==1.4.0 + - gin-config>=0.5.0 + - tensorboard>=2.11.0 + - coolname>=2.2.0 + - wandb>=0.14.0 + - lightgbm>=3.3.5 + - plotly>=5.9.0 + - shap>=0.41.0 + - pip: + - pathos>=0.2.9 + - scikit-fda>=0.8.1 + - coloredlogs>=15.0 + - types-PyYAML + - reportlab>=4.0.4 + - rbo>=0.1.3 \ No newline at end of file diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..c9b77ad --- /dev/null +++ b/mypy.ini @@ -0,0 +1,8 @@ +[mypy] +ignore_missing_imports = True +no_implicit_optional = False + +; disallow_untyped_defs 
= True + +[mypy-torch] +follow_imports = skip \ No newline at end of file