-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSCINKD.v2.1.0.GREEDY.snakefile
127 lines (106 loc) · 3.52 KB
/
SCINKD.v2.1.0.GREEDY.snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
#To run this pipeline on any machine running linux, run
#git clone https://github.com/DrPintoThe2nd/SCINKD.git
#mamba create -n scinkd meryl=1.4.1 snakemake pigz r r-dplyr r-ggplot2 mashmap --yes
#mamba activate scinkd
#mamba env export > SCINKD.v2.0.alpha_environment.yml
#snakemake --use-conda -np -s SCINKD.v2.0.alpha.snakefile
configfile: "SCINKD/config.json"
genome = config["prefix"]
rule all:
input:
#meryl_count rule(s)
directory(expand("{genome}.hap1.meryl/", genome=genome)),
directory(expand("{genome}.hap2.meryl/", genome=genome)),
#meryl_diff rule(s)
directory(expand("{genome}.hap1-minus-hap2.meryl/", genome=genome)),
directory(expand("{genome}.hap2-minus-hap1.meryl/", genome=genome)),
#meryl-lookup rule(s)
expand("{genome}.hap1-minus-hap2.bed", genome=genome),
expand("{genome}.hap2-minus-hap1.bed", genome=genome),
#results rule(s)
expand("{genome}.hap1-minus-hap2.results", genome=genome),
expand("{genome}.hap2-minus-hap1.results", genome=genome),
rule meryl_hap1_count:
input:
expand("{genome}.hap1.fasta.gz", genome=genome)
output:
directory("{genome}.hap1.meryl/"),
shell:
"""
meryl count k=28 compress memory=12 threads=12 output {output} {input}
"""
rule meryl_hap2_count:
input:
expand("{genome}.hap2.fasta.gz", genome=genome)
output:
directory("{genome}.hap2.meryl/"),
shell:
"""
meryl count k=28 compress memory=12 threads=12 output {output} {input}
"""
rule meryl_hap1_diff:
input:
hap1 = directory(expand("{genome}.hap1.meryl/", genome=genome)),
hap2 = directory(expand("{genome}.hap2.meryl/", genome=genome)),
output:
directory(expand("{genome}.hap1-minus-hap2.meryl/", genome=genome)),
shell:
"""
meryl difference {input.hap1} {input.hap2} output {output}
"""
rule meryl_hap2_diff:
input:
hap1 = directory(expand("{genome}.hap1.meryl/", genome=genome)),
hap2 = directory(expand("{genome}.hap2.meryl/", genome=genome)),
output:
directory(expand("{genome}.hap2-minus-hap1.meryl/", genome=genome)),
shell:
"""
meryl difference {input.hap2} {input.hap1} output {output}
"""
rule meryl_lookup_hap1:
input:
fa = expand("{genome}.hap1.fasta.gz", genome=genome),
hap1 = directory(expand("{genome}.hap1-minus-hap2.meryl/", genome=genome)),
output:
expand("{genome}.hap1-minus-hap2.bed", genome=genome),
shell:
"""
meryl-lookup -sequence {input.fa} -mers {input.hap1} -bed -output {output}
"""
rule meryl_lookup_hap2:
input:
fa = expand("{genome}.hap2.fasta.gz", genome=genome),
hap2 = directory(expand("{genome}.hap2-minus-hap1.meryl/", genome=genome)),
output:
expand("{genome}.hap2-minus-hap1.bed", genome=genome),
shell:
"""
meryl-lookup -sequence {input.fa} -mers {input.hap2} -bed -output {output}
"""
##calculate number of kmers occuring in each haplotype (this is rather crude at this point)
rule results_hap1:
input:
expand("{genome}.hap1-minus-hap2.bed", genome=genome),
output:
final = expand("{genome}.hap1-minus-hap2.results", genome=genome),
tmp = expand("{genome}.hap1-minus-hap2.txt", genome=genome),
shell:
"""
cut -f1 {input} | uniq -c > {output.tmp}
wait
awk '{{print $2,$1}}' OFS='\t' {output.tmp} > {output.final}
"""
rule results_hap2:
input:
expand("{genome}.hap2-minus-hap1.bed", genome=genome),
output:
final = expand("{genome}.hap2-minus-hap1.results", genome=genome),
tmp = expand("{genome}.hap2-minus-hap1.txt", genome=genome),
shell:
"""
cut -f1 {input} | uniq -c > {output.tmp}
wait
awk '{{print $2,$1}}' OFS='\t' {output.tmp} > {output.final}
"""