|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 9 | + "import os\n", |
| 10 | + "import sys\n", |
| 11 | + "import numpy as np\n", |
| 12 | + "import networkx as nx\n", |
| 13 | + "import scanpy as sc\n", |
| 14 | + "import matplotlib.pyplot as plt\n" |
| 15 | + ] |
| 16 | + }, |
| 17 | + { |
| 18 | + "attachments": {}, |
| 19 | + "cell_type": "markdown", |
| 20 | + "metadata": {}, |
| 21 | + "source": [ |
| 22 | + "## Read publibhed patient SCLC data\n", |
| 23 | + "\n", |
| 24 | + "The dataset is published through the following paper:\n", |
| 25 | + "\n", |
| 26 | + "```\n", |
| 27 | + "Chan JM, Quintanal-Villalonga Á, Gao VR, Xie Y, Allaj V, Chaudhary O, Masilionis I, Egger J, Chow A, Walle T, Mattar M. Signatures of plasticity, metastasis, and immunosuppression in an atlas of human small cell lung cancer. Cancer Cell. 2021 Nov 8;39(11):1479-96.\n", |
| 28 | + "```\n", |
| 29 | + "\n", |
| 30 | + "See [here](https://cellxgene.cziscience.com/collections/62e8f058-9c37-48bc-9200-e767f318a8ec) and [here](https://data.humantumoratlas.org/explore?selectedFilters=%5B%7B%22group%22%3A%22AtlasName%22%2C%22value%22%3A%22HTAN+MSK%22%7D%2C%7B%22value%22%3A%22hdf5%22%2C%22label%22%3A%22hdf5%22%2C%22group%22%3A%22FileFormat%22%2C%22count%22%3A11%2C%22isSelected%22%3Afalse%7D%5D&tab=file#) for the files.\n" |
| 31 | + ] |
| 32 | + }, |
| 33 | + { |
| 34 | + "cell_type": "code", |
| 35 | + "execution_count": 2, |
| 36 | + "metadata": {}, |
| 37 | + "outputs": [], |
| 38 | + "source": [ |
| 39 | + "def filter_rna(rna, nCells=-1, min_genes=100, min_cells=3, min_counts=3, n_top_genes=2000, pct_mito=40):\n", |
| 40 | + " # 1. Filter mitochondrial genes\n", |
| 41 | + " rna.var['mt'] = rna.var_names.str.startswith('MT-')\n", |
| 42 | + " sc.pp.calculate_qc_metrics(rna, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n", |
| 43 | + " plt.hist(rna.obs[\"pct_counts_mt\"], bins=100)\n", |
| 44 | + " rna = rna[rna.obs[\"pct_counts_mt\"] < pct_mito]\n", |
| 45 | + " # 2. Filter cells with too few genes\n", |
| 46 | + " sc.pp.filter_cells(rna, min_genes=min_genes)\n", |
| 47 | + " # 3. Filter genes with too few counts, or expressed in too few cells\n", |
| 48 | + " sc.pp.filter_genes(rna, min_counts=min_counts) \n", |
| 49 | + " sc.pp.filter_genes(rna, min_cells=min_cells) \n", |
| 50 | + " if nCells > -1:\n", |
| 51 | + " sc.pp.subsample(rna, n_obs=nCells)\n", |
| 52 | + " # Find highly variable genes and only keep them\n", |
| 53 | + " # Stuart, Tim, et al. \"Comprehensive integration of single-cell data.\" Cell 177.7 (2019): 1888-1902.\n", |
| 54 | + " # 1. Using count data, compute mean and variance of each gene\n", |
| 55 | + " # 2. Fit a loess with span .3 to variance against mean plot (function sigma(mean_j) for gene j)\n", |
| 56 | + " # 3. Compute the regularized z_score as z_ij = (x_ij - mean_j) / sigma(mean_j), then clip by sqrt(N) for N the number fo cells\n", |
| 57 | + " # 4. Rank by regularized z_score\n", |
| 58 | + " sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=\"seurat_v3\", subset=True)\n", |
| 59 | + " rna.layers[\"counts\"] = rna.X.copy()\n", |
| 60 | + " # Normalize, log transform and scale (changes .X)\n", |
| 61 | + " # Normalize each cell by the total counts in that cell, multiplying by median total counts per cell (so all cells have same total counts)\n", |
| 62 | + " sc.pp.normalize_total(rna)\n", |
| 63 | + " sc.pp.log1p(rna)\n", |
| 64 | + " return(rna)\n", |
| 65 | + "\n", |
| 66 | + "\n", |
| 67 | + "def preprocess_rna(rnaDir, rna=None, nCells=-1, **kwargs):\n", |
| 68 | + " \"\"\"\n", |
| 69 | + " Preprocess the RNA data\n", |
| 70 | + " Load, handle genomic coordinates, normalize and scale, compute PCA\n", |
| 71 | + " Only keeps higly variable genes (based on seurat v3 definition)\n", |
| 72 | + " Remove genes with unknown genomic coordinates\n", |
| 73 | + " input:\n", |
| 74 | + " rnaDir: directory containing the RNA data\n", |
| 75 | + " nCells: number of cells to keep (if -1, keep all)\n", |
| 76 | + "\n", |
| 77 | + " output:\n", |
| 78 | + " rna: anndata.AnnData object\n", |
| 79 | + " \"\"\"\n", |
| 80 | + " if rna is None:\n", |
| 81 | + " rna = sc.read_10x_mtx(rnaDir)\n", |
| 82 | + " \n", |
| 83 | + " # Find gene coordinates & remove genes with no coordinates\n", |
| 84 | + " # Minimal filtering...\n", |
| 85 | + " rna = filter_rna(rna=rna, nCells=nCells, **kwargs)\n", |
| 86 | + " return(rna)\n" |
| 87 | + ] |
| 88 | + }, |
| 89 | + { |
| 90 | + "cell_type": "code", |
| 91 | + "execution_count": null, |
| 92 | + "metadata": {}, |
| 93 | + "outputs": [], |
| 94 | + "source": [ |
| 95 | + "outDir = \"/path/to/data/\"\n", |
| 96 | + "adata = sc.read(os.path.join(outDir, \"Ru1322b_6634_4000.h5ad\"))\n", |
| 97 | + "\n", |
| 98 | + "# count the number of cells per 'sample' and sort \n", |
| 99 | + "adata.obs['sample'].value_counts().sort_values(ascending=False)\n", |
| 100 | + "\n", |
| 101 | + "adata.obs['sample'].unique()\n", |
| 102 | + "bdata = adata[adata.obs['sample'].str.contains('Ru1322b')].copy()\n", |
| 103 | + "\n", |
| 104 | + "# Lets keep 4K genes, and all the cells\n", |
| 105 | + "bdata.X = bdata.layers['counts'].copy()\n", |
| 106 | + "b1 = preprocess_rna(rnaDir=None, rna=bdata, gtfPath=None, nCells=-1, min_genes=100, min_cells=3, min_counts=3, n_top_genes=4000, pct_mito=80)\n", |
| 107 | + "b1.obs['labels'] = np.random.choice(['A', 'B', 'C'], size=b1.shape[0])\n", |
| 108 | + "outDir = '/juno/work/shah/users/salehis/projects/rnaseq-pfm/data/sclc_pub/'\n", |
| 109 | + "os.makedirs(outDir, exist_ok=True)\n", |
| 110 | + "os.makedirs(os.path.join(outDir, 'processed'), exist_ok=True)\n", |
| 111 | + "b1.write(os.path.join(outDir, 'Ru1322b_6634_4000.h5ad'))" |
| 112 | + ] |
| 113 | + }, |
| 114 | + { |
| 115 | + "cell_type": "code", |
| 116 | + "execution_count": null, |
| 117 | + "metadata": {}, |
| 118 | + "outputs": [], |
| 119 | + "source": [ |
| 120 | + "# create the train/validation/test split\n", |
| 121 | + "#!./driver.py setup_data -i ../data/sclc_pub/Ru1322b_6634_4000.h5ad -o ../data/sclc_pub/processed -p .2 -f True -l 1" |
| 122 | + ] |
| 123 | + } |
| 124 | + ], |
| 125 | + "metadata": { |
| 126 | + "kernelspec": { |
| 127 | + "display_name": "Python 3.9.13 ('gluere')", |
| 128 | + "language": "python", |
| 129 | + "name": "python3" |
| 130 | + }, |
| 131 | + "language_info": { |
| 132 | + "codemirror_mode": { |
| 133 | + "name": "ipython", |
| 134 | + "version": 3 |
| 135 | + }, |
| 136 | + "file_extension": ".py", |
| 137 | + "mimetype": "text/x-python", |
| 138 | + "name": "python", |
| 139 | + "nbconvert_exporter": "python", |
| 140 | + "pygments_lexer": "ipython3", |
| 141 | + "version": "3.9.13" |
| 142 | + }, |
| 143 | + "orig_nbformat": 4, |
| 144 | + "vscode": { |
| 145 | + "interpreter": { |
| 146 | + "hash": "c995ef188f1ffe598f3620a780dc2f1d2deddf2004bae9b09f2a0d8f831da693" |
| 147 | + } |
| 148 | + } |
| 149 | + }, |
| 150 | + "nbformat": 4, |
| 151 | + "nbformat_minor": 2 |
| 152 | +} |
0 commit comments