diff --git a/config/config.yaml b/config/config.yaml index 760bdc0..076036e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -79,6 +79,7 @@ assembler: "megahit" check_contigs: true # if checking contig length, longest contig must exceed this to proceed to annotation contig_annotation_thresh: 1000 +ncontigs_per_annotation_chunk: 200 # deduping reads is optional so we can use a similar codebase for transcriptomics dedup_reads: true @@ -113,7 +114,7 @@ docker_kneaddata: "docker://nickp60/kneaddata-bmtagger:0.1.0" docker_kraken: "docker://lgallagh/kraken2:2.1.2" docker_krona: "docker://nanozoo/krona:2.7.1--e7615f7" docker_megahit: "docker://ghcr.io/vdblab/megahit:5f329c6d24a1480b75145a4c14567a25453b95bf" -docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3d" +docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3g" docker_metawrap: "docker://ghcr.io/vdblab/metawrap:1.3" docker_optitype: "docker://fred2/optitype" docker_phanta: "docker://ghcr.io/vdblab/phanta:1.1.0-dev" diff --git a/development.md b/development.md index 2e25174..e80a2b8 100644 --- a/development.md +++ b/development.md @@ -36,9 +36,9 @@ The `--capture=no` allows us to see print statements. #### USAGE: ```sh -bash test.sh +bash test.sh # eg -bash test.sh preprocess tiny +bash test.sh preprocess tiny --dryrun ``` The ouput will be in a folder called `tmp_`. diff --git a/test.sh b/test.sh index ff631c8..893abc5 100755 --- a/test.sh +++ b/test.sh @@ -5,6 +5,13 @@ set -o errexit stage=$1 rawdataset=$2 +extra=${3:-''} +if [ -z "$extra" ] +then + extra="" +else + extra=$3 # use for adding --dryrun, -f somefile.jar , or other extra stuff to the command to be executed +fi case $rawdataset in tiny) echo " WARNING: this dataset will raise errors during binning/annotation and will have empty metaphlan results due to its size" @@ -101,7 +108,7 @@ case $rawdataset in esac echo $R1 -common_args="--snakefile workflow/Snakefile --rerun-incomplete --restart-times 0 --cores 32" +common_args="--snakefile workflow/Snakefile --rerun-incomplete --restart-times 0 --cores 32 $extra" case $stage in all) @@ -277,7 +284,7 @@ case $stage in snakemake \ $common_args \ --singularity-args "-B ${PWD},/data/brinkvd/" \ - --directory tmp${stage}_${rawdataset}/ \ + --directory ${PWD}/tmp${stage}_${rawdataset}/ \ --config sample=473 \ assembly=${PWD}/.test/473/473.assembly.fasta \ R1=$R1 \ @@ -289,7 +296,7 @@ case $stage in snakemake \ $common_args \ --singularity-args "-B ${PWD},/data/brinkvd/" \ - --directory tmp${stage}_${rawdataset}/ \ + --directory ${PWD}/tmp${stage}_${rawdataset}/ \ --config sample=473 \ R1=$R1 \ R2=$R2 \ @@ -301,7 +308,7 @@ case $stage in snakemake \ $common_args \ --singularity-args "-B ${PWD},/data/brinkvd/" \ - --directory tmprgi_${rawdataset}/ \ + --directory ${PWD}/tmprgi_${rawdataset}/ \ --config sample=473 \ R1=$R1 \ R2=$R2 \ diff --git a/tests/test_common.py b/tests/test_common.py index 5de7a65..70730e9 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -23,6 +23,8 @@ def test_make_assembly_split_names(): split_names = make_assembly_split_names(10000) assert len(str(split_names[0])) == 3, "bad padding!" assert len(str(split_names[1001])) == 4, "bad padding!" + assert make_assembly_split_names(4) == ["001", "002", "003", "004"] + assert make_assembly_split_names(100)[99] == "100" def test_bbmap_dedup_params_flags(): test_conditions = [ diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 229b99a..977c377 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -3,6 +3,7 @@ import sys from shutil import rmtree import glob from math import ceil +import yaml include: "common.smk" @@ -49,15 +50,21 @@ if config["check_contigs"]: ) -nseqs = 2 +nseqs = config["ncontigs_per_annotation_chunk"] # This was being done in onstart, but during subsequent evaluation of the snakefile it was falling back to a default value +nb = None # number of bases in contig if not os.path.exists("tmp_nparts"): ncontigs = 0 with open(config["assembly"], "r") as inf: for line in inf: if line.startswith(">"): - ncontigs = ncontigs + 1 + # only increment ncontigs is contig has over 1000 bases + if nb is None or nb > config["contig_annotation_thresh"]: + ncontigs = ncontigs + 1 + nb = 0 + else: + nb = nb + len(line) nparts = ceil(ncontigs / nseqs) with open("tmp_nparts", "w") as npart_file: npart_file.write(f"{nparts}\n") @@ -66,7 +73,7 @@ with open("tmp_nparts", "r") as npart_file: logger.info(f"Breaking assembly into {nparts} {nseqs}-contig chunks") - +# this is called stdin due to how we are running seqkit BATCHES = [f"stdin.part_{x}" for x in make_assembly_split_names(nparts)] if len(config["R1"]) == 1: @@ -86,6 +93,11 @@ use rule concat_lanes_fix_names from utils as utils_concat_lanes_fix_names with: e="logs/concat_lanes_fix_names_{sample}_R{rd}.e", +onstart: + with open("config_used.yaml", "w") as outfile: + yaml.dump(config, outfile) + + rule all: input: outputs, @@ -97,9 +109,10 @@ rule annotate_orfs: input: assembly="tmp/{batch}.fasta", output: - gff="annotation/annotation_{batch}/data/either_all_or_master.gff", - ffn="annotation/annotation_{batch}/data/cds.ffn", - faa="annotation/annotation_{batch}/data/cds.faa", + outdir=temp(directory("annotation/annotation_{batch}/")), + gff=temp("annotation/annotation_{batch}.either_all_or_master.gff"), + ffn=temp("annotation/annotation_{batch}.cds.ffn"), + faa=temp("annotation/annotation_{batch}.cds.faa"), resources: mem_mb=8 * 1024, runtime=lambda wc, attempt: 45 * attempt, @@ -109,6 +122,7 @@ rule annotate_orfs: shell: """ # metaerg will reuse existing incomplete outputs if outdir is present, so we delete the whole lot if present. + # it doesn't seem to be respecting the --force arg # we could use directory(), but thats going to complicate referencing outputs in downstream rules if [ -d annotation/annotation_{wildcards.batch} ]; then @@ -119,16 +133,21 @@ rule annotate_orfs: # see issues https://github.com/xiaoli-dong/metaerg/pull/38 and # https://github.com/xiaoli-dong/metaerg/issues/12 set -e - metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir annotation/annotation_{wildcards.batch} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg" + metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir {output.outdir} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg" # if metaerg successfully packaged everything up if [ -f "annotation/annotation_{wildcards.batch}/data/master.gff.txt" ] then mv annotation/annotation_{wildcards.batch}/data/master.gff.txt {output.gff} else + # TODO: see if this logic is still necessary # if it succeeded but failed at output_report.pl, no need to do anything echo "sample likely failed at output_report.pl but gff should be present" mv annotation/annotation_{wildcards.batch}/data/all.gff {output.gff} fi + mv annotation/annotation_{wildcards.batch}/data/cds.ffn annotation/annotation_{wildcards.batch}.cds.ffn + mv annotation/annotation_{wildcards.batch}/data/cds.faa annotation/annotation_{wildcards.batch}.cds.faa + + find annotation/annotation_{wildcards.batch} """ @@ -211,14 +230,14 @@ rule annotate_AMR: rule split_assembly: """ - These dummy inputs are intended to be overwritten when importing the rule """ input: assembly=config["assembly"], output: directory("tmp"), - chunks=expand("tmp/{batch}.fasta", batch=BATCHES), + chunks=temp(expand("tmp/{batch}.fasta", batch=BATCHES)), assembly=temp("tmp-" + os.path.basename(config["assembly"])), + assembly_fai=temp("tmp-" + os.path.basename(config["assembly"]) + ".seqkit.fai"), params: outdir="tmp/", nbatches=len(BATCHES), @@ -255,8 +274,8 @@ def get_annotate_cazi_memory(wildcards, attempt): rule annotate_CAZI_split: input: - faa="annotation/annotation_{batch}/data/cds.faa", - gff="annotation/annotation_{batch}/data/either_all_or_master.gff", + faa="annotation/annotation_{batch}.cds.faa", + gff="annotation/annotation_{batch}.either_all_or_master.gff", output: overview="cazi_db_scan/{batch}/overview.txt", substrate="cazi_db_scan/{batch}/substrate.out", @@ -287,11 +306,11 @@ rule annotate_CAZI_split: rule join_metaerg_outputs: input: gff=expand( - "annotation/annotation_{batch}/data/either_all_or_master.gff", + "annotation/annotation_{batch}.either_all_or_master.gff", batch=BATCHES, ), ffn=expand( - "annotation/annotation_{batch}/data/cds.ffn", + "annotation/annotation_{batch}.cds.ffn", batch=BATCHES, ), output: @@ -316,24 +335,32 @@ rule join_metaerg_outputs: rule align_annotated_genes: input: - ffn="annotation/annotation_{batch}/data/cds.ffn", + ffn=f"{config['sample']}_metaerg.ffn", r1=input_R1, r2=input_R2, output: - bamfile="annotation/annotation_{batch}/aligned_reads.bam", + bamfile=temp("aligned_reads.bam"), + index=temp( + multiext( + "bowtie/bowtie2_index", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ) + ), container: config["docker_bowtie2"] threads: 16 resources: mem_mb=16 * 1024, runtime=get_annotate_cazi_runtime, - cores=16, params: - bowtie_dir="annotation/annotation_{batch}/bowtie", - bowtie_index="annotation/annotation_{batch}/bowtie/bowtie2_index", + bowtie_index="bowtie/bowtie2_index", shell: """ - mkdir -p {params.bowtie_dir} bowtie2-build \ --threads {threads} \ {input.ffn} \ @@ -344,10 +371,10 @@ rule align_annotated_genes: rule seqkit_annotate_ffn: input: - ffn="annotation/annotation_{batch}/data/cds.ffn", + ffn=f"{config['sample']}_metaerg.ffn", output: - length_file="annotation/annotation_{batch}/seqkit.length", - bed_file="annotation/annotation_{batch}/seqkit.bed", + length_file=temp(f"{config['sample']}_metaerg.seqkit.length"), + bed_file=f"{config['sample']}_metaerg.seqkit.bed", container: config["docker_seqkit"] shell: @@ -359,11 +386,11 @@ rule seqkit_annotate_ffn: rule bedtools_coverage: input: - length_file="annotation/annotation_{batch}/seqkit.length", - bed_file="annotation/annotation_{batch}/seqkit.bed", - bamfile="annotation/annotation_{batch}/aligned_reads.bam", + length_file=f"{config['sample']}_metaerg.seqkit.length", + bed_file=f"{config['sample']}_metaerg.seqkit.bed", + bamfile="aligned_reads.bam", output: - coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt", + coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt", container: config["docker_bedtools"] shell: @@ -374,13 +401,13 @@ rule bedtools_coverage: rule create_RPM_counts: input: - coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt", - overview="cazi_db_scan/{batch}/overview.txt", - substrate="cazi_db_scan/{batch}/substrate.out", - cgc="cazi_db_scan/{batch}/cgc.out", + coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt", + overview=f"{config['sample']}_cazi_overview.txt", + substrate=f"{config['sample']}_cazi_substrate.out", + cgc=f"{config['sample']}_cazi_cgc.out", r1=config["R1"], output: - rpm_file="cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv", + rpm_file=f"{config['sample']}_annotated_cazymes_RPM.tsv", conda: "../envs/annotate_output_parse.yaml" script: @@ -392,12 +419,10 @@ rule join_CAZI: overview=expand("cazi_db_scan/{batch}/overview.txt", batch=BATCHES), substrate=expand("cazi_db_scan/{batch}/substrate.out", batch=BATCHES), cgc=expand("cazi_db_scan/{batch}/cgc.out", batch=BATCHES), - rpm=expand("cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv", batch=BATCHES), output: overview=f"{config['sample']}_cazi_overview.txt", substrate=f"{config['sample']}_cazi_substrate.out", cgc=f"{config['sample']}_cazi_cgc.out", - rpm=f"{config['sample']}_annotated_cazymes_RPM.tsv", shell: """ join_files(){{ @@ -415,5 +440,4 @@ rule join_CAZI: join_files {output.overview} {input.overview} join_files {output.substrate} {input.substrate} join_files {output.cgc} {input.cgc} - join_files {output.rpm} {input.rpm} """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 1ed3304..d5a39fc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -41,7 +41,7 @@ def make_shard_names(nshards): def make_assembly_split_names(nparts): split_names = [] # deal with the 3 digit ones first - for i in range(1, min(nparts, 100)): + for i in range(1, min(nparts + 1, 100)): split_names.append(f"{i:03}") # values after 99 are not padded if nparts > 99: