forked from Hoohm/dropSeqPipe
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
141 lines (119 loc) · 6.14 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
import os
import re
# Load the workflow configuration; Snakemake exposes it as the global `config`.
configfile: "config.yaml"

# Sample sheet: comma-separated, first row is the header, first column
# (the sample name) becomes the index.
samples = pd.read_csv("samples.csv", header=0, sep=',', index_col=0)

# Read length of every sample, in sample-sheet order (duplicates are kept).
read_lengths = list(samples.loc[:, 'read_length'])

# Constrain the {sample} wildcard to exactly the names in the sample sheet.
# Names are converted to str (a purely numeric index would otherwise break
# the join) and regex-escaped so that characters such as '.' or '+' in a
# sample name are matched literally instead of acting as regex operators.
wildcard_constraints:
    sample="({})".format("|".join(re.escape(str(name)) for name in samples.index))

# Reference file prefixes.
# The dots in the extension pattern are escaped: with the unescaped pattern
# ".fasta|.fa", '.' matched any character, so a name such as "alfalfa.fa"
# was cut at the first "<char>fa" and yielded the prefix "a".
reference_prefix = os.path.join(
    config['META']['reference-directory'],
    re.split(r"\.fasta|\.fa", config['META']['reference-file'])[0])
# Annotation file name up to (not including) the first '.gtf'.
annotation_prefix = os.path.join(
    config['META']['reference-directory'],
    config['META']['annotation-file'].split('.gtf')[0])

# Full paths of the reference and annotation files inside the reference directory.
reference_file = os.path.join(
    config['META']['reference-directory'],
    config['META']['reference-file'])
annotation_file = os.path.join(
    config['META']['reference-directory'],
    config['META']['annotation-file'])
# "<annotation name>.reduced.gtf" inside the reference directory.
annotation_reduced_file = os.path.join(
    config['META']['reference-directory'],
    '.'.join([config['META']['annotation-file'].split('.gtf')[0], 'reduced', 'gtf']))
# Targets below append "_<read_length>/SA" to this prefix.
star_index_prefix = os.path.join(config['META']['reference-directory'], 'STAR_INDEX/SA')

# Length of the cell barcode: start and end positions are both inclusive.
starttrim_length = config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1
# Default target rule: requests the meta, qc, filter, mapping and extract
# outputs listed below. It has no outputs or commands of its own.
rule all:
    input:
        # meta
        annotation_prefix + '.refFlat',
        annotation_prefix + '.reduced.gtf',
        reference_prefix + '.dict',
        reference_prefix + '.rRNA.intervals',
        expand(
            '{star_index_prefix}_{read_length}/SA',
            star_index_prefix=star_index_prefix,
            read_length=read_lengths
        ),
        # qc
        expand(
            [
                'logs/{sample}_R1_fastqc.html',
                'logs/{sample}_R2_fastqc.html',
            ],
            sample=samples.index
        ),
        'reports/fastqc_reads.html',
        'reports/fastqc_barcodes.html',
        'reports/fastqc_reads_data/multiqc_general_stats.txt',
        # filter
        expand(
            [
                'data/{sample}_filtered.fastq.gz',
                'plots/{sample}_polya_trimmed.pdf',
                'plots/{sample}_start_trim.pdf',
                'plots/{sample}_CELL_dropped.pdf',
                'plots/{sample}_UMI_dropped.pdf',
            ],
            sample=samples.index
        ),
        'plots/BC_drop.pdf',
        'reports/filter.html',
        # mapping
        expand(
            [
                'data/{sample}_final.bam',
                'logs/{sample}_hist_out_cell.txt',
                'plots/{sample}_knee_plot.pdf',
            ],
            sample=samples.index
        ),
        'reports/star.html',
        'plots/yield.pdf',
        # extract
        expand(
            [
                'logs/{sample}_umi_per_gene.tsv',
                'plots/{sample}_rna_metrics.pdf',
            ],
            sample=samples.index
        ),
        'summary/umi_expression_matrix.tsv',
        'summary/counts_expression_matrix.tsv'
# Target rule: reference-derived files only (refFlat, reduced GTF, sequence
# dictionary, rRNA intervals, and the SA file for each entry of read_lengths).
rule meta:
    input:
        annotation_prefix + '.refFlat',
        annotation_prefix + '.reduced.gtf',
        reference_prefix + '.dict',
        reference_prefix + '.rRNA.intervals',
        expand(
            '{star_index_prefix}_{read_length}/SA',
            star_index_prefix=star_index_prefix,
            read_length=read_lengths
        )
# Target rule: per-sample FastQC pages for R1 and R2 plus the aggregated
# FastQC reports.
rule qc:
    input:
        expand(
            [
                'logs/{sample}_R1_fastqc.html',
                'logs/{sample}_R2_fastqc.html',
            ],
            sample=samples.index
        ),
        'reports/fastqc_reads.html',
        'reports/fastqc_barcodes.html',
        'reports/fastqc_reads_data/multiqc_general_stats.txt'
# Target rule: per-sample filtered reads and trimming/drop plots, plus the
# experiment-wide filter report and barcode-drop plot.
rule filter:
    input:
        expand(
            [
                'data/{sample}_filtered.fastq.gz',
                'plots/{sample}_polya_trimmed.pdf',
                'plots/{sample}_start_trim.pdf',
                'plots/{sample}_CELL_dropped.pdf',
                'plots/{sample}_UMI_dropped.pdf',
            ],
            sample=samples.index
        ),
        'reports/filter.html',
        'plots/BC_drop.pdf'
# Target rule: per-sample final BAMs, cell histograms and knee plots, plus the
# experiment-wide report, comparison plots and Seurat objects file.
# The HTML variants of the comparison plots are currently disabled.
rule map:
    input:
        expand(
            [
                'data/{sample}_final.bam',
                'logs/{sample}_hist_out_cell.txt',
                'plots/{sample}_knee_plot.pdf',
            ],
            sample=samples.index
        ),
        'reports/star.html',
        'plots/violinplots_comparison_UMI.pdf',
        # 'plots/UMI_vs_counts.html',
        'plots/UMI_vs_counts.pdf',
        # 'plots/UMI_vs_gene.html',
        'plots/UMI_vs_gene.pdf',
        # 'plots/Count_vs_gene.html',
        'plots/Count_vs_gene.pdf',
        'summary/R_Seurat_objects.rdata',
        'plots/yield.pdf'
# Target rule: per-sample UMI-per-gene tables and RNA metrics plots, plus the
# two experiment-wide expression matrices.
rule extract:
    input:
        expand(
            [
                'logs/{sample}_umi_per_gene.tsv',
                'plots/{sample}_rna_metrics.pdf',
            ],
            sample=samples.index
        ),
        'summary/umi_expression_matrix.tsv',
        'summary/counts_expression_matrix.tsv'
# Target rule for mixed-species runs: per-species/per-sample barcode lists and
# unfiltered BAMs, plus the per-sample species plots.
rule split_species:
    input:
        # one file per (species, sample) combination
        expand(
            [
                'summary/{species}/{sample}_barcodes.csv',
                'data/{species}/{sample}_unfiltered.bam',
            ],
            sample=samples.index,
            species=config['META']['species']
        ),
        # one file per sample
        expand(
            [
                'plots/{sample}_species_plot_genes.pdf',
                'plots/{sample}_species_plot_transcripts.pdf',
            ],
            sample=samples.index
        )
# Target rule for mixed-species runs: per-species/per-sample expression
# matrices, UMI-per-gene tables and RNA metrics plots, plus the per-species
# experiment-wide matrices.
rule extract_species:
    input:
        # one file per (species, sample) combination
        expand(
            [
                'summary/{species}/{sample}_umi_expression_matrix.txt',
                'summary/{species}/{sample}_counts_expression_matrix.txt',
                'logs/{species}/{sample}_umi_per_gene.tsv',
                'plots/{species}/{sample}_rna_metrics.pdf',
            ],
            sample=samples.index,
            species=config['META']['species']
        ),
        # one file per species
        expand(
            [
                'summary/Experiment_{species}_counts_expression_matrix.tsv',
                'summary/Experiment_{species}_umi_expression_matrix.tsv',
            ],
            species=config['META']['species']
        )
# Pull in the rule files from the rules/ directory.
# NOTE(review): presumably these define the rules that produce the targets
# requested by the target rules in this file — confirm against the .smk files.
# The include order is kept as-is, since later files may rely on earlier ones.
include: "rules/generate_meta.smk"
include: "rules/fastqc.smk"
include: "rules/filter.smk"
include: "rules/map.smk"
include: "rules/extract_expression_single.smk"
include: "rules/split_species.smk"
include: "rules/extract_expression_species.smk"