Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ assembler: "megahit"
check_contigs: true
# if checking contig length, longest contig must exceed this to proceed to annotation
contig_annotation_thresh: 1000
ncontigs_per_annotation_chunk: 200

# deduping reads is optional so we can use a similar codebase for transcriptomics
dedup_reads: true
Expand Down Expand Up @@ -113,7 +114,7 @@ docker_kneaddata: "docker://nickp60/kneaddata-bmtagger:0.1.0"
docker_kraken: "docker://lgallagh/kraken2:2.1.2"
docker_krona: "docker://nanozoo/krona:2.7.1--e7615f7"
docker_megahit: "docker://ghcr.io/vdblab/megahit:5f329c6d24a1480b75145a4c14567a25453b95bf"
docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3d"
docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3g"
docker_metawrap: "docker://ghcr.io/vdblab/metawrap:1.3"
docker_optitype: "docker://fred2/optitype"
docker_phanta: "docker://ghcr.io/vdblab/phanta:1.1.0-dev"
Expand Down
4 changes: 2 additions & 2 deletions development.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ The `--capture=no` allows us to see print statements.
#### USAGE:

```sh
bash test.sh <pipeline_stage> <dataset>
bash test.sh <pipeline_stage> <dataset> <extra args to snakemake>
# eg
bash test.sh preprocess tiny
bash test.sh preprocess tiny --dryrun
```
The ouput will be in a folder called `tmp<pipeline_stage>_<dataset>`.

Expand Down
15 changes: 11 additions & 4 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ set -o errexit

stage=$1
rawdataset=$2
extra=${3:-''}
if [ -z "$extra" ]
then
extra=""
else
extra=$3 # use for adding --dryrun, -f somefile.jar , or other extra stuff to the command to be executed
fi
case $rawdataset in
tiny)
echo " WARNING: this dataset will raise errors during binning/annotation and will have empty metaphlan results due to its size"
Expand Down Expand Up @@ -101,7 +108,7 @@ case $rawdataset in
esac
echo $R1

common_args="--snakefile workflow/Snakefile --rerun-incomplete --restart-times 0 --cores 32"
common_args="--snakefile workflow/Snakefile --rerun-incomplete --restart-times 0 --cores 32 $extra"
case $stage in

all)
Expand Down Expand Up @@ -277,7 +284,7 @@ case $stage in
snakemake \
$common_args \
--singularity-args "-B ${PWD},/data/brinkvd/" \
--directory tmp${stage}_${rawdataset}/ \
--directory ${PWD}/tmp${stage}_${rawdataset}/ \
--config sample=473 \
assembly=${PWD}/.test/473/473.assembly.fasta \
R1=$R1 \
Expand All @@ -289,7 +296,7 @@ case $stage in
snakemake \
$common_args \
--singularity-args "-B ${PWD},/data/brinkvd/" \
--directory tmp${stage}_${rawdataset}/ \
--directory ${PWD}/tmp${stage}_${rawdataset}/ \
--config sample=473 \
R1=$R1 \
R2=$R2 \
Expand All @@ -301,7 +308,7 @@ case $stage in
snakemake \
$common_args \
--singularity-args "-B ${PWD},/data/brinkvd/" \
--directory tmprgi_${rawdataset}/ \
--directory ${PWD}/tmprgi_${rawdataset}/ \
--config sample=473 \
R1=$R1 \
R2=$R2 \
Expand Down
2 changes: 2 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def test_make_assembly_split_names():
split_names = make_assembly_split_names(10000)
assert len(str(split_names[0])) == 3, "bad padding!"
assert len(str(split_names[1001])) == 4, "bad padding!"
assert make_assembly_split_names(4) == ["001", "002", "003", "004"]
assert make_assembly_split_names(100)[99] == "100"

def test_bbmap_dedup_params_flags():
test_conditions = [
Expand Down
92 changes: 58 additions & 34 deletions workflow/rules/annotate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import sys
from shutil import rmtree
import glob
from math import ceil
import yaml


include: "common.smk"
Expand Down Expand Up @@ -49,15 +50,21 @@ if config["check_contigs"]:
)


nseqs = 2
nseqs = config["ncontigs_per_annotation_chunk"]

# This was being done in onstart, but during subsequent evaluation of the snakefile it was falling back to a default value
nb = None # number of bases in contig
if not os.path.exists("tmp_nparts"):
ncontigs = 0
with open(config["assembly"], "r") as inf:
for line in inf:
if line.startswith(">"):
ncontigs = ncontigs + 1
# only increment ncontigs is contig has over 1000 bases
if nb is None or nb > config["contig_annotation_thresh"]:
ncontigs = ncontigs + 1
nb = 0
else:
nb = nb + len(line)
nparts = ceil(ncontigs / nseqs)
with open("tmp_nparts", "w") as npart_file:
npart_file.write(f"{nparts}\n")
Expand All @@ -66,7 +73,7 @@ with open("tmp_nparts", "r") as npart_file:

logger.info(f"Breaking assembly into {nparts} {nseqs}-contig chunks")


# this is called stdin due to how we are running seqkit
BATCHES = [f"stdin.part_{x}" for x in make_assembly_split_names(nparts)]

if len(config["R1"]) == 1:
Expand All @@ -86,6 +93,11 @@ use rule concat_lanes_fix_names from utils as utils_concat_lanes_fix_names with:
e="logs/concat_lanes_fix_names_{sample}_R{rd}.e",


onstart:
with open("config_used.yaml", "w") as outfile:
yaml.dump(config, outfile)


rule all:
input:
outputs,
Expand All @@ -97,9 +109,10 @@ rule annotate_orfs:
input:
assembly="tmp/{batch}.fasta",
output:
gff="annotation/annotation_{batch}/data/either_all_or_master.gff",
ffn="annotation/annotation_{batch}/data/cds.ffn",
faa="annotation/annotation_{batch}/data/cds.faa",
outdir=temp(directory("annotation/annotation_{batch}/")),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think probably the outdir in the command below should be updated to make use of the $output.outdir variable?

gff=temp("annotation/annotation_{batch}.either_all_or_master.gff"),
ffn=temp("annotation/annotation_{batch}.cds.ffn"),
faa=temp("annotation/annotation_{batch}.cds.faa"),
resources:
mem_mb=8 * 1024,
runtime=lambda wc, attempt: 45 * attempt,
Expand All @@ -109,6 +122,7 @@ rule annotate_orfs:
shell:
"""
# metaerg will reuse existing incomplete outputs if outdir is present, so we delete the whole lot if present.
# it doesn't seem to be respecting the --force arg
# we could use directory(), but thats going to complicate referencing outputs in downstream rules
if [ -d annotation/annotation_{wildcards.batch} ];
then
Expand All @@ -119,16 +133,21 @@ rule annotate_orfs:
# see issues https://github.com/xiaoli-dong/metaerg/pull/38 and
# https://github.com/xiaoli-dong/metaerg/issues/12
set -e
metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir annotation/annotation_{wildcards.batch} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg"
metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir {output.outdir} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg"
# if metaerg successfully packaged everything up
if [ -f "annotation/annotation_{wildcards.batch}/data/master.gff.txt" ]
then
mv annotation/annotation_{wildcards.batch}/data/master.gff.txt {output.gff}
else
# TODO: see if this logic is still necessary
# if it succeeded but failed at output_report.pl, no need to do anything
echo "sample likely failed at output_report.pl but gff should be present"
mv annotation/annotation_{wildcards.batch}/data/all.gff {output.gff}
fi
mv annotation/annotation_{wildcards.batch}/data/cds.ffn annotation/annotation_{wildcards.batch}.cds.ffn
mv annotation/annotation_{wildcards.batch}/data/cds.faa annotation/annotation_{wildcards.batch}.cds.faa

find annotation/annotation_{wildcards.batch}
"""


Expand Down Expand Up @@ -211,14 +230,14 @@ rule annotate_AMR:

rule split_assembly:
"""
These dummy inputs are intended to be overwritten when importing the rule
"""
input:
assembly=config["assembly"],
output:
directory("tmp"),
chunks=expand("tmp/{batch}.fasta", batch=BATCHES),
chunks=temp(expand("tmp/{batch}.fasta", batch=BATCHES)),
assembly=temp("tmp-" + os.path.basename(config["assembly"])),
assembly_fai=temp("tmp-" + os.path.basename(config["assembly"]) + ".seqkit.fai"),
params:
outdir="tmp/",
nbatches=len(BATCHES),
Expand Down Expand Up @@ -255,8 +274,8 @@ def get_annotate_cazi_memory(wildcards, attempt):

rule annotate_CAZI_split:
input:
faa="annotation/annotation_{batch}/data/cds.faa",
gff="annotation/annotation_{batch}/data/either_all_or_master.gff",
faa="annotation/annotation_{batch}.cds.faa",
gff="annotation/annotation_{batch}.either_all_or_master.gff",
output:
overview="cazi_db_scan/{batch}/overview.txt",
substrate="cazi_db_scan/{batch}/substrate.out",
Expand Down Expand Up @@ -287,11 +306,11 @@ rule annotate_CAZI_split:
rule join_metaerg_outputs:
input:
gff=expand(
"annotation/annotation_{batch}/data/either_all_or_master.gff",
"annotation/annotation_{batch}.either_all_or_master.gff",
batch=BATCHES,
),
ffn=expand(
"annotation/annotation_{batch}/data/cds.ffn",
"annotation/annotation_{batch}.cds.ffn",
batch=BATCHES,
),
output:
Expand All @@ -316,24 +335,32 @@ rule join_metaerg_outputs:

rule align_annotated_genes:
input:
ffn="annotation/annotation_{batch}/data/cds.ffn",
ffn=f"{config['sample']}_metaerg.ffn",
r1=input_R1,
r2=input_R2,
output:
bamfile="annotation/annotation_{batch}/aligned_reads.bam",
bamfile=temp("aligned_reads.bam"),
index=temp(
multiext(
"bowtie/bowtie2_index",
".1.bt2",
".2.bt2",
".3.bt2",
".4.bt2",
".rev.1.bt2",
".rev.2.bt2",
)
),
container:
config["docker_bowtie2"]
threads: 16
resources:
mem_mb=16 * 1024,
runtime=get_annotate_cazi_runtime,
cores=16,
params:
bowtie_dir="annotation/annotation_{batch}/bowtie",
bowtie_index="annotation/annotation_{batch}/bowtie/bowtie2_index",
bowtie_index="bowtie/bowtie2_index",
shell:
"""
mkdir -p {params.bowtie_dir}
bowtie2-build \
--threads {threads} \
{input.ffn} \
Expand All @@ -344,10 +371,10 @@ rule align_annotated_genes:

rule seqkit_annotate_ffn:
input:
ffn="annotation/annotation_{batch}/data/cds.ffn",
ffn=f"{config['sample']}_metaerg.ffn",
output:
length_file="annotation/annotation_{batch}/seqkit.length",
bed_file="annotation/annotation_{batch}/seqkit.bed",
length_file=temp(f"{config['sample']}_metaerg.seqkit.length"),
bed_file=f"{config['sample']}_metaerg.seqkit.bed",
container:
config["docker_seqkit"]
shell:
Expand All @@ -359,11 +386,11 @@ rule seqkit_annotate_ffn:

rule bedtools_coverage:
input:
length_file="annotation/annotation_{batch}/seqkit.length",
bed_file="annotation/annotation_{batch}/seqkit.bed",
bamfile="annotation/annotation_{batch}/aligned_reads.bam",
length_file=f"{config['sample']}_metaerg.seqkit.length",
bed_file=f"{config['sample']}_metaerg.seqkit.bed",
bamfile="aligned_reads.bam",
output:
coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt",
coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt",
container:
config["docker_bedtools"]
shell:
Expand All @@ -374,13 +401,13 @@ rule bedtools_coverage:

rule create_RPM_counts:
input:
coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt",
overview="cazi_db_scan/{batch}/overview.txt",
substrate="cazi_db_scan/{batch}/substrate.out",
cgc="cazi_db_scan/{batch}/cgc.out",
coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt",
overview=f"{config['sample']}_cazi_overview.txt",
substrate=f"{config['sample']}_cazi_substrate.out",
cgc=f"{config['sample']}_cazi_cgc.out",
r1=config["R1"],
output:
rpm_file="cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv",
rpm_file=f"{config['sample']}_annotated_cazymes_RPM.tsv",
conda:
"../envs/annotate_output_parse.yaml"
script:
Expand All @@ -392,12 +419,10 @@ rule join_CAZI:
overview=expand("cazi_db_scan/{batch}/overview.txt", batch=BATCHES),
substrate=expand("cazi_db_scan/{batch}/substrate.out", batch=BATCHES),
cgc=expand("cazi_db_scan/{batch}/cgc.out", batch=BATCHES),
rpm=expand("cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv", batch=BATCHES),
output:
overview=f"{config['sample']}_cazi_overview.txt",
substrate=f"{config['sample']}_cazi_substrate.out",
cgc=f"{config['sample']}_cazi_cgc.out",
rpm=f"{config['sample']}_annotated_cazymes_RPM.tsv",
shell:
"""
join_files(){{
Expand All @@ -415,5 +440,4 @@ rule join_CAZI:
join_files {output.overview} {input.overview}
join_files {output.substrate} {input.substrate}
join_files {output.cgc} {input.cgc}
join_files {output.rpm} {input.rpm}
"""
2 changes: 1 addition & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def make_shard_names(nshards):
def make_assembly_split_names(nparts):
split_names = []
# deal with the 3 digit ones first
for i in range(1, min(nparts, 100)):
for i in range(1, min(nparts + 1, 100)):
split_names.append(f"{i:03}")
# values after 99 are not padded
if nparts > 99:
Expand Down