Commit 822295b

First commit.
0 parents  commit 822295b

83 files changed, +19108 -0 lines changed


.gitignore

+66
@@ -0,0 +1,66 @@
*.py[cod]
__pycache__

# Temp files
.*.sw[po]
*~
*.bak
.DS_Store

# C extensions
*.so

# Build and package files
*.egg
*.egg-info
.bootstrap
.build
.cache
.eggs
.env
.installed.cfg
.ve
bin
build
develop-eggs
dist
eggs
lib
lib64
parts
pip-wheel-metadata/
pyvenv*/
sdist
var
venv*/
wheelhouse
.venv*/

# Installer logs
pip-log.txt

# Unit test / coverage reports
.benchmarks
.coverage
.coverage.*
.pytest
.pytest_cache/
.tox
coverage.xml
htmlcov
nosetests.xml

# Translations
*.mo

# Buildout
.mr.developer.cfg

# IDE project files
*.iml
*.komodoproject
.idea
.project
.pydevproject
.vscode

README.md

+47
@@ -0,0 +1,47 @@
# Supplement for Population priors for matrix factorization

```
Population Priors for Matrix Factorization.
Sohrab Salehi, Achille Nazaret, Sohrab P Shah, David Blei
TMLR (in press).
```

## Organization

```
├── popprior_mf
├── README.md
├── data
├── notebooks
├── pipelines
```

Every subdirectory contains a README.md file with more details.
Below we give a brief description of the contents of each file/directory.

## popprior_mf

The packaged source code, ready to install with `pip`.
Please see `popprior_mf/README.md` for details.

## data

The preprocessed data for the MovieLens 100K dataset, with the train/validation/test split, ready to be used with the code.

## notebooks

Jupyter notebooks to preprocess the data.

## pipelines

The `nextflow` files and instructions to recreate all experiments for the manuscript.

## README.md

This file.

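The README above notes that `popprior_mf` is packaged for `pip`. As a minimal sketch (the relative path and the editable flag are assumptions; `popprior_mf/README.md` gives the authoritative instructions), an install from the supplement root could look like:

```
pip install -e popprior_mf/
```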

data/README.md

+41
@@ -0,0 +1,41 @@
# Dataset for Population priors for matrix factorization

This directory contains the processed data for the `MovieLens 100K` dataset.
The data has the train/validation/test split.

The other datasets, namely `MovieLens 1M`, `UserArtists`, `Ru1322b`, and `GoodBooks`, are too large to be included with the supplement.
They can be accessed as follows:

## MovieLens 1M

This dataset is accessible [here](https://grouplens.org/datasets/movielens/1m/).

## UserArtists

See [here](https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-readme.txt).

```
Cantador, I., Brusilovsky, P., and Kuflik, T. Second workshop on information heterogeneity and fusion in recommender systems (HetRec2011). In ACM RecSys, pp. 387–388, 2011.
```

## Ru1322b

The dataset was published with the following manuscript:

```
Chan JM, Quintanal-Villalonga Á, Gao VR, Xie Y, Allaj V, Chaudhary O, Masilionis I, Egger J, Chow A, Walle T, Mattar M. Signatures of plasticity, metastasis, and immunosuppression in an atlas of human small cell lung cancer. Cancer Cell. 2021 Nov 8;39(11):1479-96.
```

See [here](https://cellxgene.cziscience.com/collections/62e8f058-9c37-48bc-9200-e767f318a8ec) and [here](https://data.humantumoratlas.org/explore?selectedFilters=%5B%7B%22group%22%3A%22AtlasName%22%2C%22value%22%3A%22HTAN+MSK%22%7D%2C%7B%22value%22%3A%22hdf5%22%2C%22label%22%3A%22hdf5%22%2C%22group%22%3A%22FileFormat%22%2C%22count%22%3A11%2C%22isSelected%22%3Afalse%7D%5D&tab=file#) for the files.

## GoodBooks

Accessed [here](https://github.com/zygmuntz/goodbooks-10k).

data/ml-100k_0.2_1.0.pkl

5.64 MB
Binary file not shown.

data/ml-100k_0.2_1.0_counts.pkl

5.27 MB
Binary file not shown.
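The two pickle files above are the processed MovieLens 100K data with the train/validation/test split. A minimal sketch for inspecting one of them, assuming it is a standard Python pickle (the layout of the pickled object is not documented in this commit, so check its type before relying on it):

```
import pickle

# Hypothetical quick inspection; the structure of the pickled object is an
# assumption and should be verified against the code that consumes it.
with open("data/ml-100k_0.2_1.0.pkl", "rb") as f:
    data = pickle.load(f)

print(type(data))
```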

notebooks/README.md

+30
@@ -0,0 +1,30 @@
# Notebooks

Jupyter notebooks to create the simulated data (Figure 1), preprocess the data (Figures 2 & 3), and generate the plots (Figures 1-3 and all figures in the supplement).

```
├── README.md
├── gene_expression_preprocess.ipynb
├── movielens_100k_preprocess.ipynb
├── movielens_1m_preprocess.ipynb
├── goodbooks.ipynb
├── goodbooks_preprocess.ipynb
├── usrers_artists_preprocess.ipynb
```

## Preprocessing

The following notebooks contain the code used to preprocess the raw input files:

```
gene_expression_preprocess.ipynb
movielens_100k_preprocess.ipynb
movielens_1m_preprocess.ipynb
usrers_artists_preprocess.ipynb
goodbooks_preprocess.ipynb
```

The output of these notebooks can then be used to generate the train/validation/test split via the `setup_data` command of the `driver.py` script.

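For reference, the SCLC preprocessing notebook included in this commit records one such `setup_data` invocation (commented out at the end of the notebook); the input path, output path, and flag values are specific to that dataset and would change for the other datasets:

```
./driver.py setup_data -i ../data/sclc_pub/Ru1322b_6634_4000.h5ad -o ../data/sclc_pub/processed -p .2 -f True -l 1
```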
+152
@@ -0,0 +1,152 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "import networkx as nx\n",
    "import scanpy as sc\n",
    "import matplotlib.pyplot as plt\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read published patient SCLC data\n",
    "\n",
    "The dataset is published through the following paper:\n",
    "\n",
    "```\n",
    "Chan JM, Quintanal-Villalonga Á, Gao VR, Xie Y, Allaj V, Chaudhary O, Masilionis I, Egger J, Chow A, Walle T, Mattar M. Signatures of plasticity, metastasis, and immunosuppression in an atlas of human small cell lung cancer. Cancer Cell. 2021 Nov 8;39(11):1479-96.\n",
    "```\n",
    "\n",
    "See [here](https://cellxgene.cziscience.com/collections/62e8f058-9c37-48bc-9200-e767f318a8ec) and [here](https://data.humantumoratlas.org/explore?selectedFilters=%5B%7B%22group%22%3A%22AtlasName%22%2C%22value%22%3A%22HTAN+MSK%22%7D%2C%7B%22value%22%3A%22hdf5%22%2C%22label%22%3A%22hdf5%22%2C%22group%22%3A%22FileFormat%22%2C%22count%22%3A11%2C%22isSelected%22%3Afalse%7D%5D&tab=file#) for the files.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_rna(rna, nCells=-1, min_genes=100, min_cells=3, min_counts=3, n_top_genes=2000, pct_mito=40):\n",
    "    # 1. Filter mitochondrial genes\n",
    "    rna.var['mt'] = rna.var_names.str.startswith('MT-')\n",
    "    sc.pp.calculate_qc_metrics(rna, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n",
    "    plt.hist(rna.obs[\"pct_counts_mt\"], bins=100)\n",
    "    rna = rna[rna.obs[\"pct_counts_mt\"] < pct_mito]\n",
    "    # 2. Filter cells with too few genes\n",
    "    sc.pp.filter_cells(rna, min_genes=min_genes)\n",
    "    # 3. Filter genes with too few counts, or expressed in too few cells\n",
    "    sc.pp.filter_genes(rna, min_counts=min_counts)\n",
    "    sc.pp.filter_genes(rna, min_cells=min_cells)\n",
    "    if nCells > -1:\n",
    "        sc.pp.subsample(rna, n_obs=nCells)\n",
    "    # Find highly variable genes and only keep them\n",
    "    # Stuart, Tim, et al. \"Comprehensive integration of single-cell data.\" Cell 177.7 (2019): 1888-1902.\n",
    "    # 1. Using count data, compute mean and variance of each gene\n",
    "    # 2. Fit a loess with span .3 to variance against mean plot (function sigma(mean_j) for gene j)\n",
    "    # 3. Compute the regularized z_score as z_ij = (x_ij - mean_j) / sigma(mean_j), then clip by sqrt(N) for N the number of cells\n",
    "    # 4. Rank by regularized z_score\n",
    "    sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=\"seurat_v3\", subset=True)\n",
    "    rna.layers[\"counts\"] = rna.X.copy()\n",
    "    # Normalize, log transform and scale (changes .X)\n",
    "    # Normalize each cell by the total counts in that cell, multiplying by median total counts per cell (so all cells have same total counts)\n",
    "    sc.pp.normalize_total(rna)\n",
    "    sc.pp.log1p(rna)\n",
    "    return(rna)\n",
    "\n",
    "\n",
    "def preprocess_rna(rnaDir, rna=None, nCells=-1, **kwargs):\n",
    "    \"\"\"\n",
    "    Preprocess the RNA data\n",
    "    Load, handle genomic coordinates, normalize and scale, compute PCA\n",
    "    Only keeps highly variable genes (based on seurat v3 definition)\n",
    "    Remove genes with unknown genomic coordinates\n",
    "    input:\n",
    "        rnaDir: directory containing the RNA data\n",
    "        nCells: number of cells to keep (if -1, keep all)\n",
    "\n",
    "    output:\n",
    "        rna: anndata.AnnData object\n",
    "    \"\"\"\n",
    "    if rna is None:\n",
    "        rna = sc.read_10x_mtx(rnaDir)\n",
    "\n",
    "    # Find gene coordinates & remove genes with no coordinates\n",
    "    # Minimal filtering...\n",
    "    rna = filter_rna(rna=rna, nCells=nCells, **kwargs)\n",
    "    return(rna)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outDir = \"/path/to/data/\"\n",
    "adata = sc.read(os.path.join(outDir, \"Ru1322b_6634_4000.h5ad\"))\n",
    "\n",
    "# count the number of cells per 'sample' and sort\n",
    "adata.obs['sample'].value_counts().sort_values(ascending=False)\n",
    "\n",
    "adata.obs['sample'].unique()\n",
    "bdata = adata[adata.obs['sample'].str.contains('Ru1322b')].copy()\n",
    "\n",
    "# Let's keep 4K genes, and all the cells\n",
    "bdata.X = bdata.layers['counts'].copy()\n",
    "b1 = preprocess_rna(rnaDir=None, rna=bdata, nCells=-1, min_genes=100, min_cells=3, min_counts=3, n_top_genes=4000, pct_mito=80)\n",
    "b1.obs['labels'] = np.random.choice(['A', 'B', 'C'], size=b1.shape[0])\n",
    "outDir = '/juno/work/shah/users/salehis/projects/rnaseq-pfm/data/sclc_pub/'\n",
    "os.makedirs(outDir, exist_ok=True)\n",
    "os.makedirs(os.path.join(outDir, 'processed'), exist_ok=True)\n",
    "b1.write(os.path.join(outDir, 'Ru1322b_6634_4000.h5ad'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the train/validation/test split\n",
    "#!./driver.py setup_data -i ../data/sclc_pub/Ru1322b_6634_4000.h5ad -o ../data/sclc_pub/processed -p .2 -f True -l 1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('gluere')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "c995ef188f1ffe598f3620a780dc2f1d2deddf2004bae9b09f2a0d8f831da693"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
