biobakery · wbazant · Feb 15, 2021 · Jul 27, 2021 · Nov 4, 2022 · Nov 4, 2022
diff --git a/kneaddata/bowtie2_discordant_pairs.py b/kneaddata/bowtie2_discordant_pairs.py
@@ -30,12 +30,14 @@
 
 import sys
 import os
+import logging
+logger=logging.getLogger(__name__)
 
 try:
     import argparse
 except ImportError:
     sys.exit("Please upgrade to python 2.7+")
-    
+
 import string
 import tempfile
 import subprocess
@@ -71,7 +73,17 @@ def parse_arguments(args):
         "-2",
         dest="pair2",
         help="the fastq file of pair2 reads",
-        required=True) 
+        required=True)
+    parser.add_argument(
+        "--mateIds_are_equal",
+        dest="mateIds_are_equal",
+        help="If mateIds are same or different",
+        required=True)
+    parser.add_argument(
+        "--query_mate_separator",
+        dest="query_mate_separator",
+        help="Separator between queryid and mate",
+        required=True)
     parser.add_argument(
         "-x",
         dest="index",
@@ -124,153 +136,204 @@ def parse_arguments(args):
         "--reorder",
         action="store_true",
         help="print the sequences in the same order as the input files")
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="print diagnostics")
 
     return parser.parse_args()
 
+def bowtie2_unpaired_command(args, input_fastq_path, output_sam_path, maintain_input_ordering):
 
-def run_bowtie2(bowtie2_path,pair1,pair2,orphans,database,sam,threads,options,reorder):
-    """ Run bowtie2 with the options provided """
-
-    command=[bowtie2_path]
-    # if pairs are to be run as single end, then provide them all as orphans
-    if orphans:
-        orphans=",".join([pair1,pair2,orphans])
-    else:
-        orphans=",".join([pair1,pair2])
+    # Bowtie2 from input as unpaired reads to output, using the provided index
+    command=[args.bowtie2, "-U", input_fastq_path, "-S", output_sam_path, "-x", args.index]
 
-    command+=["--threads",str(threads),"-x",database,"-S",sam,"--no-head"]
-    if orphans:
-        command+=["-U",orphans]
-    if options:
-        command+=utilities.format_options_to_list([options])
-    if reorder:
+
+    # Runtime options
+    command+=["--threads",str(args.threads)]
+    if args.bowtie2_options:
+        command+=utilities.format_options_to_list([args.bowtie2_options])
+
+    if not args.verbose:
+        command+=["--quiet"]
+
+    # Maintain ordering of the fastq, if required
+    if maintain_input_ordering:
         command+=["--reorder"]
 
+    return command
+
+def run_command(command, **kwargs):
     try:
-        return_code=subprocess.check_call(command)
+        return_code=subprocess.check_call(command, **kwargs)
     except (EnvironmentError, subprocess.CalledProcessError) as e:
-        message="Unable to run bowtie2: " +" ".join(command)
+        message="Unable to run: " +" ".join(command)
         if hasattr(e, 'output') and e.output:
-            message+="\nError message returned from bowtie2:\n" + e.output
+            message+="\nError message returned:\n" + e.output
         sys.exit(message)
 
-def organize_alignments_single(sam,open_files,counts,mode):
-    """ Organize the alignments that were generated running the pairs as single end reads """
-
-    # read through the sam file, organizing reads by those that aligned 
-    aligned={}
-    unaligned={}
-    with open(sam) as file_handle:
-        for line in file_handle:
-            data=line.rstrip().split("\t")
-            flag=int(data[1])
-
-            query_id = data[0][:-1]
-            pair1 = False if data[0][-1] == "2" else True
-
-            # check if the read aligned
-            if flag & 4:
-                # this read did not align to the reference
-                if not query_id in unaligned:
-                    unaligned[query_id]=set()
-                unaligned[query_id].add(pair1)
+def read_sam_line(line,query_mate_separator):
+    data=line.rstrip().split("\t")
+    separatorLocation = data[0].find(query_mate_separator)
+    query_id = data[0][:separatorLocation]
+    mate = data[0][separatorLocation+1]
+    is_aligned = not (int(data[1]) & 4)
+    read="\n".join(["@"+data[0], data[9], "+", data[10], ""])
+    return (query_id, mate, is_aligned, read)
+
+def process_alignments(pair1_sam, pair2_sam, orphan_sam, aligned_pair, unaligned_pair, aligned_orphan, unaligned_orphan, mateIds_are_equal, query_mate_separator, treat_pair_as_aligned_if_either_read_aligned):
+    """ Read through the paired sam alignments and organize into the output files """
+
+    open_files={
+       1: {
+         'both_aligned': open(aligned_pair.replace("%","1"),"wt"),
+         'both_unaligned': open(unaligned_pair.replace("%","1"),"wt"),
+         'only_this_aligned': open(aligned_orphan.replace("%","1"),"wt"),
+         'only_this_unaligned': open(unaligned_orphan.replace("%","1"),"wt")
+       },
+       2: {
+         'both_aligned': open(aligned_pair.replace("%","2"),"wt"),
+         'both_unaligned': open(unaligned_pair.replace("%","2"),"wt"),
+         'only_this_aligned': open(aligned_orphan.replace("%","2"),"wt"),
+         'only_this_unaligned': open(unaligned_orphan.replace("%","2"),"wt")
+       }
+    }
+    counts={
+         'both_aligned': 0,
+         'both_unaligned': 0,
+         'only_this_aligned': 0,
+         'only_this_unaligned': 0
+    }
+
+
+    with open(pair1_sam) as fh_1, open(pair2_sam) as fh_2:
+        line_1=fh_1.readline()
+        while line_1.startswith("@"):
+            line_1=fh_1.readline()
+        line_2=fh_2.readline()
+        while line_2.startswith("@"):
+            line_2=fh_2.readline()
+        line_count = 1
+        while line_1 and line_2:
+            query_id_1, mate_1, is_aligned_1, read_1 = read_sam_line(line_1,query_mate_separator)
+            query_id_2, mate_2, is_aligned_2, read_2 = read_sam_line(line_2,query_mate_separator)
+            if (mateIds_are_equal=='True'):
+                if not (query_id_1 == query_id_2 and mate_1 == mate_2):
+                    raise ValueError("QueryIds: "+str(query_id_1)+","+str(query_id_1)+" mates: "+str(mate_1)+","+str(mate_2)+". Mates do not match on line.")
             else:
-                # this read aligned to the reference
-                if not query_id in aligned:
-                    aligned[query_id]=set()
-                aligned[query_id].add(pair1)
-
-    # if running in strict mode, for all pairs with a single alignment
-    # also filter out the other pair
-    if mode == "strict":
-        for query_id in aligned:
-            if len(aligned[query_id]) == 1 and len(unaligned.get(query_id,[])) == 1:
-                aligned[query_id].update(unaligned[query_id])
-                unaligned[query_id]=set()
-
-    # read through the sam file again to write the reads to the output files
-    with open(sam) as file_handle:
-        for line in file_handle:
-            data=line.rstrip().split("\t")
-
-            query_id = data[0][:-1]
-            pair1 = False if data[0][-1] == "2" else True
-
-            # check the alignment type of this query
-            align_values = aligned.get(query_id,[])
-            unalign_values = unaligned.get(query_id,[])
-
-            if len(align_values) > 1:
-                # both reads in the pair aligned to the reference
-                file_name = "pair1_aligned" if pair1 else "pair2_aligned"
-            elif len(unalign_values) > 1:
-                # both reads did not align to the reference
-                file_name = "pair1_unaligned" if pair1 else "pair2_unaligned"
-            elif pair1 in align_values:
-                # only this read from the pair aligned to the reference
-                file_name = "orphan1_aligned" if pair1 else "orphan2_aligned"
-            elif pair1 in unalign_values:
-                # only this read from the pair did not align to the reference
-                file_name = "orphan1_unaligned" if pair1 else "orphan2_unaligned"
-
-            # write the read to the file
-            open_files[file_name].write("\n".join(["@"+data[0],data[9],"+",data[10]])+"\n")
-            # increase the count
-            counts[file_name]=counts[file_name]+1                
-
+                if not (query_id_1 == query_id_2 and mate_1 != mate_2):
+                    raise ValueError("QueryIds: "+str(query_id_1)+","+str(query_id_1)+" mates: "+str(mate_1)+","+str(mate_2)+". Mates match.")
+
+            aa = is_aligned_1 and is_aligned_2 or (treat_pair_as_aligned_if_either_read_aligned and (is_aligned_1 or is_aligned_2))
 
-def process_alignments(sam,aligned_pair,unaligned_pair,aligned_orphan,unaligned_orphan,mode):
-    """ Read through the sam alignments and organize into the output files """
-
-    # open the output files
-    pair1_aligned=open(aligned_pair.replace("%","1"),"wt")
-    pair2_aligned=open(aligned_pair.replace("%","2"),"wt")
-
-    pair1_unaligned=open(unaligned_pair.replace("%","1"),"wt")
-    pair2_unaligned=open(unaligned_pair.replace("%","2"),"wt")
-
-    orphan1_aligned=open(aligned_orphan.replace("%","1"),"wt")
-    orphan2_aligned=open(aligned_orphan.replace("%","2"),"wt")
-
-    orphan1_unaligned=open(unaligned_orphan.replace("%","1"),"wt")
-    orphan2_unaligned=open(unaligned_orphan.replace("%","2"),"wt")  
-
-    open_files={"pair1_aligned":pair1_aligned,"pair2_aligned":pair2_aligned,
-                "pair1_unaligned":pair1_unaligned,"pair2_unaligned":pair2_unaligned,
-                "orphan1_aligned":orphan1_aligned,"orphan2_aligned":orphan2_aligned,
-                "orphan1_unaligned":orphan1_unaligned,"orphan2_unaligned":orphan2_unaligned}
-    counts={name:0 for name in open_files.keys()}  
-
-    organize_alignments_single(sam,open_files,counts,mode)
-
-    # close all of the files
-    for file_name, file_handle in open_files.items():
-        file_handle.close()
-
-    # write out the counts for each file
-    for file_name,total in counts.items():
-        print(file_name+" : "+str(total))
-
+            x1 = 'both_aligned' if aa else 'only_this_aligned' if is_aligned_1 else 'only_this_unaligned' if is_aligned_2 else 'both_unaligned'
+            x2 = 'both_aligned' if aa else 'only_this_aligned' if is_aligned_2 else 'only_this_unaligned' if is_aligned_1 else 'both_unaligned'
+
+            counts[x1]+=1
+            open_files[1][x1].write(read_1)
+            open_files[2][x2].write(read_2)
+            line_1=fh_1.readline()
+            line_2=fh_2.readline()
+            line_count+=1
+
+        while line_1:
+            query_id_1, mate_1, is_aligned_1, read_1 = read_sam_line(line_1,query_mate_separator)
+            x1 = 'only_this_aligned' if is_aligned_1 else 'only_this_unaligned'
+            counts[x1]+=1
+            open_files[1][x1].write(read_1)
+            line_1=fh_1.readline()
+
+        while line_2:
+            query_id_2, mate_2, is_aligned_2, read_2 = read_sam_line(line_2, query_mate_separator)
+            x2 = 'only_this_aligned' if is_aligned_2 else 'only_this_unaligned'
+            counts[x2]+=2
+            open_files[2][x2].write(read_2)
+            line_2=fh_2.readline()
+
+    logger.info("Paired reads: {0} both aligned, {1} both unaligned, {2} only pair1 aligned, {3} only pair2 aligned"
+     .format(counts["both_aligned"], counts["both_unaligned"], counts["only_this_aligned"], counts["only_this_unaligned"])
+    )
+    if orphan_sam:
+        orphan_counts = {'only_this_aligned' : 0, 'only_this_unaligned': 0}
+        with open(orphan_sam) as fh:
+            line = fh.readline()
+            while line.startswith("@"):
+                line=fh.readline()
+            while line:
+                query_id, mate, is_aligned, read = read_sam_line(line,query_mate_separator)
+                m = 2 if mate == '2' else 1
+                x = 'only_this_aligned' if is_aligned else 'only_this_unaligned'
+                orphan_counts[x]+=1
+                open_files[m][x].write(read)
+                line = fh.readline()
+        logger.info("Orphan reads: {0} aligned, {1} unaligned"
+          .format(orphan_counts["only_this_aligned"], orphan_counts["only_this_unaligned"])
+        )
+
+    for fh in open_files[1].values():
+       fh.close()
+    for fh in open_files[2].values():
+       fh.close()
+
+def temp_sam_path(args, name):
+    output_dir=os.path.dirname(args.un_pair)
+    file_out, sam = tempfile.mkstemp("kneaddata_"+name+".sam", dir=output_dir)
+    os.close(file_out)
+    return sam
 
 def main():
     # parse the command line arguments
     args = parse_arguments(sys.argv)
-
-    # if no sam output is provided or it is set to dev/null, write to temp file
-    output_dir=os.path.dirname(args.un_pair)
-    temp_files=[]
-    if args.sam is None or args.sam == os.devnull:
-        file_out, args.sam = tempfile.mkstemp("kneaddata_", "_temp.sam", dir=output_dir)
-        os.close(file_out)
-        temp_files.append(args.sam)
-
-    # run bowtie2
-    run_bowtie2(args.bowtie2,args.pair1,args.pair2,args.orphan,args.index,args.sam,args.threads,args.bowtie2_options,args.reorder)
-
-    # write output files
-    process_alignments(args.sam,args.al_pair,args.un_pair,args.al_single,args.un_single,args.mode)
-
-    # remove the temp files
+    temp_files = []
+
+    ch = logging.StreamHandler()
+    ch.setFormatter(logging.Formatter('%(asctime)s kneaddata_bowtie2_discordant_pairs - %(message)s'))
+    logger.addHandler(ch)
+    if args.verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+
+    logger.debug("info")
+    pair1_sam=temp_sam_path(args, "pair1")
+    temp_files.append(pair1_sam)
+    command=bowtie2_unpaired_command(args, input_fastq_path = args.pair1, output_sam_path = pair1_sam, maintain_input_ordering = True)
+    logger.debug("Aligning pair1 reads: "+ " ".join(command))
+    run_command(command)
+
+    pair2_sam=temp_sam_path(args, "pair2")
+    temp_files.append(pair2_sam)
+    command=bowtie2_unpaired_command(args, input_fastq_path = args.pair2, output_sam_path = pair2_sam, maintain_input_ordering = True)
+    logger.debug("Aligning pair2 reads: "+ " ".join(command))
+    run_command(command)
+
+    orphan_sam = None
+    if args.orphan:
+        orphan_sam=temp_sam_path(args, "orphan")
+        temp_files.append(orphan_sam)
+        command=bowtie2_unpaired_command(args, input_fastq_path = args.orphan, output_sam_path = orphan_sam, maintain_input_ordering = args.reorder)
+        logger.debug("Aligning orphan reads: "+ " ".join(command))
+        run_command(command)
+
+    logger.debug("Processing the alignments")
+    process_alignments(pair1_sam, pair2_sam, orphan_sam,
+      aligned_pair = args.al_pair,
+      unaligned_pair = args.un_pair,
+      aligned_orphan = args.al_single,
+      unaligned_orphan = args.un_single,
+      mateIds_are_equal = args.mateIds_are_equal,
+      query_mate_separator = args.query_mate_separator,
+      treat_pair_as_aligned_if_either_read_aligned = (args.mode == "strict")
+    )
+
+    if args.sam and args.sam != os.devnull:
+        logger.debug("Aggregating .sam output to " + args.sam)
+        command = ["cat"]
+        command.extend(temp_files)
+        with open(args.sam, 'w') as fh: 
+            run_command(command, stdout = fh)
+
+    logger.debug("Removing temporary files")
     for file in temp_files:
         utilities.remove_file(file)
 

diff --git a/kneaddata/knead_data.py b/kneaddata/knead_data.py
@@ -144,6 +144,16 @@ def parse_arguments(args):
         action="store_true",
         dest='bmtagger',
         help="run BMTagger instead of Bowtie2 to identify contaminant reads")
+    group1.add_argument(
+        "--mateIds_are_equal",
+        choices=('True','False'),
+        dest='mateIds_are_equal',
+        help="Are mates in paired reads equal or different. I.e. 1 and 1 or 1 and 2")
+    group1.add_argument(
+        "--query_mate_separator",
+        dest='query_mate_separator',
+        help="Seperator between queryid and mate",
+        required=True)
     group1.add_argument(
         "--bypass-trf",
         action="store_true",