|
| 1 | +from joblib import Parallel, delayed |
| 2 | +import multiprocessing |
| 3 | +import scanpy as sc |
| 4 | +import pandas as pd |
| 5 | +import glob |
| 6 | +import os |
| 7 | + |
| 8 | +def soupy_ratio(data_dir, num_cores = -1): |
| 9 | + |
| 10 | + ''' |
| 11 | + Calculates the ratio of gene counts associated to empty droplets to all droplets. |
| 12 | + |
| 13 | + This aims to identify soupy genes, those whose expression ratio is high in the empty droplets. |
| 14 | + A higher score (i.e close to 1), the more likely the gene is soupy. |
| 15 | + |
| 16 | + Parameters: |
| 17 | + - data_dir (int): Folder containing all the libraries associated to a given experiment. |
| 18 | + - num_cores (int): Number of cores to use, default all unless specified. |
| 19 | + |
| 20 | + Parameters: |
| 21 | + - a dataframe with the ratio of soupiness associated to every gene. |
| 22 | + |
| 23 | + ''' |
| 24 | + |
| 25 | + # -- List all subfolders containing raw and filtered matrices |
| 26 | + dirs_dropl = glob.glob(os.path.join(data_dir, '*', 'raw_feature_bc_matrix')) |
| 27 | + dirs_cells = glob.glob(os.path.join(data_dir, '*', 'filtered_feature_bc_matrix')) |
| 28 | + |
| 29 | + # -- Function to load adata and modify cell barcodes |
| 30 | + def load_adata(obj_dir): |
| 31 | + adata = sc.read_10x_mtx(obj_dir) |
| 32 | + adata.obs.index = os.path.basename(obj_dir.split('/')[-2]) + '_' + adata.obs.index |
| 33 | + return adata |
| 34 | + |
| 35 | + |
| 36 | + # -- Set number of cores for parallelization |
| 37 | + if num_cores <= 0: |
| 38 | + num_cores = multiprocessing.cpu_count() |
| 39 | + |
| 40 | + # -- Read all matrices and concateante into a single object for raw and filtered |
| 41 | + if __name__ == "__main__": |
| 42 | + print('Reading raw_feature_bc_matrix objects') |
| 43 | + dropl_holder = Parallel(n_jobs = num_cores)(delayed(load_adata)(obj_dir = i) |
| 44 | + for i in dirs_dropl) |
| 45 | + |
| 46 | + print('Reading filtered_feature_bc_matrix objects') |
| 47 | + cells_holder = Parallel(n_jobs = num_cores)(delayed(load_adata)(obj_dir = i) |
| 48 | + for i in dirs_cells) |
| 49 | + |
| 50 | + # -- Concatenate objets |
| 51 | + adata_dropl = dropl_holder[0].concatenate(dropl_holder[1:], join = 'outer') |
| 52 | + del dropl_holder |
| 53 | + |
| 54 | + adata_cells = cells_holder[0].concatenate(cells_holder[1:], join = 'outer') |
| 55 | + del cells_holder |
| 56 | + |
| 57 | + # -- From droplets remove barcodes associtate to cells |
| 58 | + adata_dropl = adata_dropl[~adata_dropl.obs.index.isin(list(adata_cells.obs.index))] |
| 59 | + |
| 60 | + # -- Calculate number of gene counts |
| 61 | + counts_dropl = adata_dropl.X.sum(axis = 0).A1 |
| 62 | + counts_cells = adata_cells.X.sum(axis = 0).A1 |
| 63 | + |
| 64 | + # -- Join counts in a dataframe and discard genes whose counts is 0 in both objects |
| 65 | + soupy_ratio = pd.DataFrame({'droplets' : counts_dropl, 'cells' : counts_cells}, |
| 66 | + index = adata_dropl.var.index) |
| 67 | + |
| 68 | + # -- Remove genes 0 counts for droplets and cells |
| 69 | + soupy_ratio = soupy_ratio[soupy_ratio[['droplets', 'cells']].sum(axis = 1) != 0] |
| 70 | + |
| 71 | + # -- Calculate ratio of soupiness |
| 72 | + soupy_ratio['soupy_ratio'] = soupy_ratio['droplets']/soupy_ratio[['droplets', 'cells']].sum(axis = 1) |
| 73 | + |
| 74 | + return(soupy_ratio) |
0 commit comments