vdblab · nickp60 · Apr 25, 2025 · Apr 19, 2025 · Apr 21, 2025 · Apr 21, 2025
diff --git a/config/config.yaml b/config/config.yaml
@@ -79,6 +79,7 @@ assembler: "megahit"
 check_contigs: true
 # if checking contig length, longest contig must exceed this to proceed to annotation
 contig_annotation_thresh: 1000
+ncontigs_per_annotation_chunk: 200
 
 # deduping reads is optional so we can use a similar codebase for transcriptomics
 dedup_reads: true
@@ -113,7 +114,7 @@ docker_kneaddata: "docker://nickp60/kneaddata-bmtagger:0.1.0"
 docker_kraken: "docker://lgallagh/kraken2:2.1.2"
 docker_krona: "docker://nanozoo/krona:2.7.1--e7615f7"
 docker_megahit: "docker://ghcr.io/vdblab/megahit:5f329c6d24a1480b75145a4c14567a25453b95bf"
-docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3d"
+docker_metaerg: "docker://ghcr.io/vdblab/metaerg:1.2.3g"
 docker_metawrap: "docker://ghcr.io/vdblab/metawrap:1.3"
 docker_optitype: "docker://fred2/optitype"
 docker_phanta: "docker://ghcr.io/vdblab/phanta:1.1.0-dev"

diff --git a/development.md b/development.md
@@ -36,9 +36,9 @@ The `--capture=no` allows us to see print statements.
 #### USAGE:
 
 ```sh
-bash test.sh <pipeline_stage> <dataset>
+bash test.sh <pipeline_stage> <dataset> <extra args to snakemake>
 # eg
-bash test.sh preprocess tiny
+bash test.sh preprocess tiny --dryrun
 ```
 The ouput will be in a folder called `tmp<pipeline_stage>_<dataset>`.
 

diff --git a/test.sh b/test.sh
@@ -5,6 +5,13 @@ set -o errexit
 
 stage=$1
 rawdataset=$2
+extra=${3:-''}
+if [ -z "$extra" ]
+then
+    extra=""
+else
+    extra=$3 # use for adding --dryrun, -f somefile.jar , or other extra stuff to the command to be executed
+fi
 case $rawdataset in
     tiny)
 	echo " WARNING: this dataset will raise errors during binning/annotation and will have empty metaphlan results due to its size"
@@ -101,7 +108,7 @@ case $rawdataset in
 esac
 echo $R1
 
-common_args="--snakefile workflow/Snakefile  --rerun-incomplete --restart-times 0 --cores 32"
+common_args="--snakefile workflow/Snakefile  --rerun-incomplete --restart-times 0 --cores 32 $extra"
 case $stage in
 
      all)
@@ -277,7 +284,7 @@ case $stage in
 	snakemake \
 	    $common_args \
 	    --singularity-args "-B ${PWD},/data/brinkvd/" \
-	    --directory tmp${stage}_${rawdataset}/ \
+	    --directory ${PWD}/tmp${stage}_${rawdataset}/ \
 	    --config sample=473 \
 	    assembly=${PWD}/.test/473/473.assembly.fasta  \
 	    R1=$R1 \
@@ -289,7 +296,7 @@ case $stage in
 	snakemake \
 	    $common_args \
 	    --singularity-args "-B ${PWD},/data/brinkvd/" \
-	    --directory tmp${stage}_${rawdataset}/ \
+	    --directory ${PWD}/tmp${stage}_${rawdataset}/ \
 	    --config sample=473 \
 	    R1=$R1 \
 	    R2=$R2 \
@@ -301,7 +308,7 @@ case $stage in
 	snakemake \
 	    $common_args \
 	    --singularity-args "-B ${PWD},/data/brinkvd/" \
-	    --directory tmprgi_${rawdataset}/ \
+	    --directory ${PWD}/tmprgi_${rawdataset}/ \
 	    --config sample=473 \
 	    R1=$R1 \
 	    R2=$R2 \

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -23,6 +23,8 @@ def test_make_assembly_split_names():
     split_names = make_assembly_split_names(10000)
     assert len(str(split_names[0])) == 3, "bad padding!"
     assert len(str(split_names[1001])) == 4, "bad padding!"
+    assert make_assembly_split_names(4) == ["001", "002", "003", "004"]
+    assert make_assembly_split_names(100)[99] == "100"
 
 def test_bbmap_dedup_params_flags():
     test_conditions = [

diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
@@ -3,6 +3,7 @@ import sys
 from shutil import rmtree
 import glob
 from math import ceil
+import yaml
 
 
 include: "common.smk"
@@ -49,15 +50,21 @@ if config["check_contigs"]:
     )
 
 
-nseqs = 2
+nseqs = config["ncontigs_per_annotation_chunk"]
 
 # This was being done in onstart, but during subsequent evaluation of the snakefile it was falling back to a default value
+nb = None  # number of bases in contig
 if not os.path.exists("tmp_nparts"):
     ncontigs = 0
     with open(config["assembly"], "r") as inf:
         for line in inf:
             if line.startswith(">"):
-                ncontigs = ncontigs + 1
+                # only increment ncontigs is contig has over 1000 bases
+                if nb is None or nb > config["contig_annotation_thresh"]:
+                    ncontigs = ncontigs + 1
+                nb = 0
+            else:
+                nb = nb + len(line)
     nparts = ceil(ncontigs / nseqs)
     with open("tmp_nparts", "w") as npart_file:
         npart_file.write(f"{nparts}\n")
@@ -66,7 +73,7 @@ with open("tmp_nparts", "r") as npart_file:
 
 logger.info(f"Breaking assembly into {nparts} {nseqs}-contig chunks")
 
-
+# this is called stdin due to how we are running seqkit
 BATCHES = [f"stdin.part_{x}" for x in make_assembly_split_names(nparts)]
 
 if len(config["R1"]) == 1:
@@ -86,6 +93,11 @@ use rule concat_lanes_fix_names from utils as utils_concat_lanes_fix_names with:
         e="logs/concat_lanes_fix_names_{sample}_R{rd}.e",
 
 
+onstart:
+    with open("config_used.yaml", "w") as outfile:
+        yaml.dump(config, outfile)
+
+
 rule all:
     input:
         outputs,
@@ -97,9 +109,10 @@ rule annotate_orfs:
     input:
         assembly="tmp/{batch}.fasta",
     output:
-        gff="annotation/annotation_{batch}/data/either_all_or_master.gff",
-        ffn="annotation/annotation_{batch}/data/cds.ffn",
-        faa="annotation/annotation_{batch}/data/cds.faa",
+        outdir=temp(directory("annotation/annotation_{batch}/")),
+        gff=temp("annotation/annotation_{batch}.either_all_or_master.gff"),
+        ffn=temp("annotation/annotation_{batch}.cds.ffn"),
+        faa=temp("annotation/annotation_{batch}.cds.faa"),
     resources:
         mem_mb=8 * 1024,
         runtime=lambda wc, attempt: 45 * attempt,
@@ -109,6 +122,7 @@ rule annotate_orfs:
     shell:
         """
         # metaerg will reuse existing incomplete outputs if outdir is present, so we delete the whole lot if present.
+        # it doesn't seem to be respecting the --force arg
         # we could use directory(), but thats going to complicate referencing outputs in downstream rules
         if [ -d annotation/annotation_{wildcards.batch} ];
         then
@@ -119,16 +133,21 @@ rule annotate_orfs:
         # see issues https://github.com/xiaoli-dong/metaerg/pull/38 and
         # https://github.com/xiaoli-dong/metaerg/issues/12
         set -e
-        metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir annotation/annotation_{wildcards.batch} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg"
+        metaerg.pl --cpus {threads} --dbdir {params.metaerg_db_dir} --outdir {output.outdir} --locustag {wildcards.batch} {input.assembly} --force || echo "Finished running Metaerg"
         # if metaerg successfully packaged everything up
         if [ -f "annotation/annotation_{wildcards.batch}/data/master.gff.txt" ]
         then
             mv annotation/annotation_{wildcards.batch}/data/master.gff.txt {output.gff}
         else
+            # TODO: see if this logic is still necessary
             # if it succeeded but failed at output_report.pl, no need to do anything
             echo "sample likely failed at output_report.pl but gff should be present"
             mv annotation/annotation_{wildcards.batch}/data/all.gff {output.gff}
         fi
+        mv annotation/annotation_{wildcards.batch}/data/cds.ffn annotation/annotation_{wildcards.batch}.cds.ffn
+        mv annotation/annotation_{wildcards.batch}/data/cds.faa annotation/annotation_{wildcards.batch}.cds.faa
+
+        find annotation/annotation_{wildcards.batch}
         """
 
 
@@ -211,14 +230,14 @@ rule annotate_AMR:
 
 rule split_assembly:
     """
-    These dummy inputs are intended to be overwritten when importing the rule
     """
     input:
         assembly=config["assembly"],
     output:
         directory("tmp"),
-        chunks=expand("tmp/{batch}.fasta", batch=BATCHES),
+        chunks=temp(expand("tmp/{batch}.fasta", batch=BATCHES)),
         assembly=temp("tmp-" + os.path.basename(config["assembly"])),
+        assembly_fai=temp("tmp-" + os.path.basename(config["assembly"]) + ".seqkit.fai"),
     params:
         outdir="tmp/",
         nbatches=len(BATCHES),
@@ -255,8 +274,8 @@ def get_annotate_cazi_memory(wildcards, attempt):
 
 rule annotate_CAZI_split:
     input:
-        faa="annotation/annotation_{batch}/data/cds.faa",
-        gff="annotation/annotation_{batch}/data/either_all_or_master.gff",
+        faa="annotation/annotation_{batch}.cds.faa",
+        gff="annotation/annotation_{batch}.either_all_or_master.gff",
     output:
         overview="cazi_db_scan/{batch}/overview.txt",
         substrate="cazi_db_scan/{batch}/substrate.out",
@@ -287,11 +306,11 @@ rule annotate_CAZI_split:
 rule join_metaerg_outputs:
     input:
         gff=expand(
-            "annotation/annotation_{batch}/data/either_all_or_master.gff",
+            "annotation/annotation_{batch}.either_all_or_master.gff",
             batch=BATCHES,
         ),
         ffn=expand(
-            "annotation/annotation_{batch}/data/cds.ffn",
+            "annotation/annotation_{batch}.cds.ffn",
             batch=BATCHES,
         ),
     output:
@@ -316,24 +335,32 @@ rule join_metaerg_outputs:
 
 rule align_annotated_genes:
     input:
-        ffn="annotation/annotation_{batch}/data/cds.ffn",
+        ffn=f"{config['sample']}_metaerg.ffn",
         r1=input_R1,
         r2=input_R2,
     output:
-        bamfile="annotation/annotation_{batch}/aligned_reads.bam",
+        bamfile=temp("aligned_reads.bam"),
+        index=temp(
+            multiext(
+                "bowtie/bowtie2_index",
+                ".1.bt2",
+                ".2.bt2",
+                ".3.bt2",
+                ".4.bt2",
+                ".rev.1.bt2",
+                ".rev.2.bt2",
+            )
+        ),
     container:
         config["docker_bowtie2"]
     threads: 16
     resources:
         mem_mb=16 * 1024,
         runtime=get_annotate_cazi_runtime,
-        cores=16,
     params:
-        bowtie_dir="annotation/annotation_{batch}/bowtie",
-        bowtie_index="annotation/annotation_{batch}/bowtie/bowtie2_index",
+        bowtie_index="bowtie/bowtie2_index",
     shell:
         """
-        mkdir -p {params.bowtie_dir}
         bowtie2-build \
             --threads {threads} \
             {input.ffn} \
@@ -344,10 +371,10 @@ rule align_annotated_genes:
 
 rule seqkit_annotate_ffn:
     input:
-        ffn="annotation/annotation_{batch}/data/cds.ffn",
+        ffn=f"{config['sample']}_metaerg.ffn",
     output:
-        length_file="annotation/annotation_{batch}/seqkit.length",
-        bed_file="annotation/annotation_{batch}/seqkit.bed",
+        length_file=temp(f"{config['sample']}_metaerg.seqkit.length"),
+        bed_file=f"{config['sample']}_metaerg.seqkit.bed",
     container:
         config["docker_seqkit"]
     shell:
@@ -359,11 +386,11 @@ rule seqkit_annotate_ffn:
 
 rule bedtools_coverage:
     input:
-        length_file="annotation/annotation_{batch}/seqkit.length",
-        bed_file="annotation/annotation_{batch}/seqkit.bed",
-        bamfile="annotation/annotation_{batch}/aligned_reads.bam",
+        length_file=f"{config['sample']}_metaerg.seqkit.length",
+        bed_file=f"{config['sample']}_metaerg.seqkit.bed",
+        bamfile="aligned_reads.bam",
     output:
-        coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt",
+        coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt",
     container:
         config["docker_bedtools"]
     shell:
@@ -374,13 +401,13 @@ rule bedtools_coverage:
 
 rule create_RPM_counts:
     input:
-        coverage="annotation/annotation_{batch}/annotated_gene_coverage.txt",
-        overview="cazi_db_scan/{batch}/overview.txt",
-        substrate="cazi_db_scan/{batch}/substrate.out",
-        cgc="cazi_db_scan/{batch}/cgc.out",
+        coverage=f"{config['sample']}_metaerg.annotated_gene_coverage.txt",
+        overview=f"{config['sample']}_cazi_overview.txt",
+        substrate=f"{config['sample']}_cazi_substrate.out",
+        cgc=f"{config['sample']}_cazi_cgc.out",
         r1=config["R1"],
     output:
-        rpm_file="cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv",
+        rpm_file=f"{config['sample']}_annotated_cazymes_RPM.tsv",
     conda:
         "../envs/annotate_output_parse.yaml"
     script:
@@ -392,12 +419,10 @@ rule join_CAZI:
         overview=expand("cazi_db_scan/{batch}/overview.txt", batch=BATCHES),
         substrate=expand("cazi_db_scan/{batch}/substrate.out", batch=BATCHES),
         cgc=expand("cazi_db_scan/{batch}/cgc.out", batch=BATCHES),
-        rpm=expand("cazi_db_scan/{batch}/annoted_cazymes_RPM.tsv", batch=BATCHES),
     output:
         overview=f"{config['sample']}_cazi_overview.txt",
         substrate=f"{config['sample']}_cazi_substrate.out",
         cgc=f"{config['sample']}_cazi_cgc.out",
-        rpm=f"{config['sample']}_annotated_cazymes_RPM.tsv",
     shell:
         """
         join_files(){{
@@ -415,5 +440,4 @@ rule join_CAZI:
         join_files {output.overview} {input.overview}
         join_files {output.substrate} {input.substrate}
         join_files {output.cgc} {input.cgc}
-        join_files {output.rpm} {input.rpm}
         """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -41,7 +41,7 @@ def make_shard_names(nshards):
 def make_assembly_split_names(nparts):
     split_names = []
     # deal with the 3 digit ones first
-    for i in range(1, min(nparts, 100)):
+    for i in range(1, min(nparts + 1, 100)):
         split_names.append(f"{i:03}")
     # values after 99 are not padded
     if nparts > 99: