Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use read pairing information in kneaddata_bowtie2_discordant_pairs #16

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
325 changes: 194 additions & 131 deletions kneaddata/bowtie2_discordant_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@

import sys
import os
import logging
logger=logging.getLogger(__name__)

try:
import argparse
except ImportError:
sys.exit("Please upgrade to python 2.7+")

import string
import tempfile
import subprocess
Expand Down Expand Up @@ -71,7 +73,17 @@ def parse_arguments(args):
"-2",
dest="pair2",
help="the fastq file of pair2 reads",
required=True)
required=True)
parser.add_argument(
"--mateIds_are_equal",
dest="mateIds_are_equal",
help="If mateIds are same or different",
required=True)
parser.add_argument(
"--query_mate_separator",
dest="query_mate_separator",
help="Separator between queryid and mate",
required=True)
parser.add_argument(
"-x",
dest="index",
Expand Down Expand Up @@ -124,153 +136,204 @@ def parse_arguments(args):
"--reorder",
action="store_true",
help="print the sequences in the same order as the input files")
parser.add_argument(
"--verbose",
action="store_true",
help="print diagnostics")

return parser.parse_args()

def bowtie2_unpaired_command(args, input_fastq_path, output_sam_path, maintain_input_ordering):

def run_bowtie2(bowtie2_path,pair1,pair2,orphans,database,sam,threads,options,reorder):
""" Run bowtie2 with the options provided """

command=[bowtie2_path]
# if pairs are to be run as single end, then provide them all as orphans
if orphans:
orphans=",".join([pair1,pair2,orphans])
else:
orphans=",".join([pair1,pair2])
# Bowtie2 from input as unpaired reads to output, using the provided index
command=[args.bowtie2, "-U", input_fastq_path, "-S", output_sam_path, "-x", args.index]

command+=["--threads",str(threads),"-x",database,"-S",sam,"--no-head"]
if orphans:
command+=["-U",orphans]
if options:
command+=utilities.format_options_to_list([options])
if reorder:

# Runtime options
command+=["--threads",str(args.threads)]
if args.bowtie2_options:
command+=utilities.format_options_to_list([args.bowtie2_options])

if not args.verbose:
command+=["--quiet"]

# Maintain ordering of the fastq, if required
if maintain_input_ordering:
command+=["--reorder"]

return command

def run_command(command, **kwargs):
try:
return_code=subprocess.check_call(command)
return_code=subprocess.check_call(command, **kwargs)
except (EnvironmentError, subprocess.CalledProcessError) as e:
message="Unable to run bowtie2: " +" ".join(command)
message="Unable to run: " +" ".join(command)
if hasattr(e, 'output') and e.output:
message+="\nError message returned from bowtie2:\n" + e.output
message+="\nError message returned:\n" + e.output
sys.exit(message)

def organize_alignments_single(sam,open_files,counts,mode):
""" Organize the alignments that were generated running the pairs as single end reads """

# read through the sam file, organizing reads by those that aligned
aligned={}
unaligned={}
with open(sam) as file_handle:
for line in file_handle:
data=line.rstrip().split("\t")
flag=int(data[1])

query_id = data[0][:-1]
pair1 = False if data[0][-1] == "2" else True

# check if the read aligned
if flag & 4:
# this read did not align to the reference
if not query_id in unaligned:
unaligned[query_id]=set()
unaligned[query_id].add(pair1)
def read_sam_line(line,query_mate_separator):
data=line.rstrip().split("\t")
separatorLocation = data[0].find(query_mate_separator)
query_id = data[0][:separatorLocation]
mate = data[0][separatorLocation+1]
is_aligned = not (int(data[1]) & 4)
read="\n".join(["@"+data[0], data[9], "+", data[10], ""])
return (query_id, mate, is_aligned, read)

def process_alignments(pair1_sam, pair2_sam, orphan_sam, aligned_pair, unaligned_pair, aligned_orphan, unaligned_orphan, mateIds_are_equal, query_mate_separator, treat_pair_as_aligned_if_either_read_aligned):
""" Read through the paired sam alignments and organize into the output files """

open_files={
1: {
'both_aligned': open(aligned_pair.replace("%","1"),"wt"),
'both_unaligned': open(unaligned_pair.replace("%","1"),"wt"),
'only_this_aligned': open(aligned_orphan.replace("%","1"),"wt"),
'only_this_unaligned': open(unaligned_orphan.replace("%","1"),"wt")
},
2: {
'both_aligned': open(aligned_pair.replace("%","2"),"wt"),
'both_unaligned': open(unaligned_pair.replace("%","2"),"wt"),
'only_this_aligned': open(aligned_orphan.replace("%","2"),"wt"),
'only_this_unaligned': open(unaligned_orphan.replace("%","2"),"wt")
}
}
counts={
'both_aligned': 0,
'both_unaligned': 0,
'only_this_aligned': 0,
'only_this_unaligned': 0
}


with open(pair1_sam) as fh_1, open(pair2_sam) as fh_2:
line_1=fh_1.readline()
while line_1.startswith("@"):
line_1=fh_1.readline()
line_2=fh_2.readline()
while line_2.startswith("@"):
line_2=fh_2.readline()
line_count = 1
while line_1 and line_2:
query_id_1, mate_1, is_aligned_1, read_1 = read_sam_line(line_1,query_mate_separator)
query_id_2, mate_2, is_aligned_2, read_2 = read_sam_line(line_2,query_mate_separator)
if (mateIds_are_equal=='True'):
if not (query_id_1 == query_id_2 and mate_1 == mate_2):
raise ValueError("QueryIds: "+str(query_id_1)+","+str(query_id_1)+" mates: "+str(mate_1)+","+str(mate_2)+". Mates do not match on line.")
else:
# this read aligned to the reference
if not query_id in aligned:
aligned[query_id]=set()
aligned[query_id].add(pair1)

# if running in strict mode, for all pairs with a single alignment
# also filter out the other pair
if mode == "strict":
for query_id in aligned:
if len(aligned[query_id]) == 1 and len(unaligned.get(query_id,[])) == 1:
aligned[query_id].update(unaligned[query_id])
unaligned[query_id]=set()

# read through the sam file again to write the reads to the output files
with open(sam) as file_handle:
for line in file_handle:
data=line.rstrip().split("\t")

query_id = data[0][:-1]
pair1 = False if data[0][-1] == "2" else True

# check the alignment type of this query
align_values = aligned.get(query_id,[])
unalign_values = unaligned.get(query_id,[])

if len(align_values) > 1:
# both reads in the pair aligned to the reference
file_name = "pair1_aligned" if pair1 else "pair2_aligned"
elif len(unalign_values) > 1:
# both reads did not align to the reference
file_name = "pair1_unaligned" if pair1 else "pair2_unaligned"
elif pair1 in align_values:
# only this read from the pair aligned to the reference
file_name = "orphan1_aligned" if pair1 else "orphan2_aligned"
elif pair1 in unalign_values:
# only this read from the pair did not align to the reference
file_name = "orphan1_unaligned" if pair1 else "orphan2_unaligned"

# write the read to the file
open_files[file_name].write("\n".join(["@"+data[0],data[9],"+",data[10]])+"\n")
# increase the count
counts[file_name]=counts[file_name]+1

if not (query_id_1 == query_id_2 and mate_1 != mate_2):
raise ValueError("QueryIds: "+str(query_id_1)+","+str(query_id_1)+" mates: "+str(mate_1)+","+str(mate_2)+". Mates match.")

aa = is_aligned_1 and is_aligned_2 or (treat_pair_as_aligned_if_either_read_aligned and (is_aligned_1 or is_aligned_2))

def process_alignments(sam,aligned_pair,unaligned_pair,aligned_orphan,unaligned_orphan,mode):
""" Read through the sam alignments and organize into the output files """

# open the output files
pair1_aligned=open(aligned_pair.replace("%","1"),"wt")
pair2_aligned=open(aligned_pair.replace("%","2"),"wt")

pair1_unaligned=open(unaligned_pair.replace("%","1"),"wt")
pair2_unaligned=open(unaligned_pair.replace("%","2"),"wt")

orphan1_aligned=open(aligned_orphan.replace("%","1"),"wt")
orphan2_aligned=open(aligned_orphan.replace("%","2"),"wt")

orphan1_unaligned=open(unaligned_orphan.replace("%","1"),"wt")
orphan2_unaligned=open(unaligned_orphan.replace("%","2"),"wt")

open_files={"pair1_aligned":pair1_aligned,"pair2_aligned":pair2_aligned,
"pair1_unaligned":pair1_unaligned,"pair2_unaligned":pair2_unaligned,
"orphan1_aligned":orphan1_aligned,"orphan2_aligned":orphan2_aligned,
"orphan1_unaligned":orphan1_unaligned,"orphan2_unaligned":orphan2_unaligned}
counts={name:0 for name in open_files.keys()}

organize_alignments_single(sam,open_files,counts,mode)

# close all of the files
for file_name, file_handle in open_files.items():
file_handle.close()

# write out the counts for each file
for file_name,total in counts.items():
print(file_name+" : "+str(total))

x1 = 'both_aligned' if aa else 'only_this_aligned' if is_aligned_1 else 'only_this_unaligned' if is_aligned_2 else 'both_unaligned'
x2 = 'both_aligned' if aa else 'only_this_aligned' if is_aligned_2 else 'only_this_unaligned' if is_aligned_1 else 'both_unaligned'

counts[x1]+=1
open_files[1][x1].write(read_1)
open_files[2][x2].write(read_2)
line_1=fh_1.readline()
line_2=fh_2.readline()
line_count+=1

while line_1:
query_id_1, mate_1, is_aligned_1, read_1 = read_sam_line(line_1,query_mate_separator)
x1 = 'only_this_aligned' if is_aligned_1 else 'only_this_unaligned'
counts[x1]+=1
open_files[1][x1].write(read_1)
line_1=fh_1.readline()

while line_2:
query_id_2, mate_2, is_aligned_2, read_2 = read_sam_line(line_2, query_mate_separator)
x2 = 'only_this_aligned' if is_aligned_2 else 'only_this_unaligned'
counts[x2]+=2
open_files[2][x2].write(read_2)
line_2=fh_2.readline()

logger.info("Paired reads: {0} both aligned, {1} both unaligned, {2} only pair1 aligned, {3} only pair2 aligned"
.format(counts["both_aligned"], counts["both_unaligned"], counts["only_this_aligned"], counts["only_this_unaligned"])
)
if orphan_sam:
orphan_counts = {'only_this_aligned' : 0, 'only_this_unaligned': 0}
with open(orphan_sam) as fh:
line = fh.readline()
while line.startswith("@"):
line=fh.readline()
while line:
query_id, mate, is_aligned, read = read_sam_line(line,query_mate_separator)
m = 2 if mate == '2' else 1
x = 'only_this_aligned' if is_aligned else 'only_this_unaligned'
orphan_counts[x]+=1
open_files[m][x].write(read)
line = fh.readline()
logger.info("Orphan reads: {0} aligned, {1} unaligned"
.format(orphan_counts["only_this_aligned"], orphan_counts["only_this_unaligned"])
)

for fh in open_files[1].values():
fh.close()
for fh in open_files[2].values():
fh.close()

def temp_sam_path(args, name):
output_dir=os.path.dirname(args.un_pair)
file_out, sam = tempfile.mkstemp("kneaddata_"+name+".sam", dir=output_dir)
os.close(file_out)
return sam

def main():
# parse the command line arguments
args = parse_arguments(sys.argv)

# if no sam output is provided or it is set to dev/null, write to temp file
output_dir=os.path.dirname(args.un_pair)
temp_files=[]
if args.sam is None or args.sam == os.devnull:
file_out, args.sam = tempfile.mkstemp("kneaddata_", "_temp.sam", dir=output_dir)
os.close(file_out)
temp_files.append(args.sam)

# run bowtie2
run_bowtie2(args.bowtie2,args.pair1,args.pair2,args.orphan,args.index,args.sam,args.threads,args.bowtie2_options,args.reorder)

# write output files
process_alignments(args.sam,args.al_pair,args.un_pair,args.al_single,args.un_single,args.mode)

# remove the temp files
temp_files = []

ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s kneaddata_bowtie2_discordant_pairs - %(message)s'))
logger.addHandler(ch)
if args.verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)

logger.debug("info")
pair1_sam=temp_sam_path(args, "pair1")
temp_files.append(pair1_sam)
command=bowtie2_unpaired_command(args, input_fastq_path = args.pair1, output_sam_path = pair1_sam, maintain_input_ordering = True)
logger.debug("Aligning pair1 reads: "+ " ".join(command))
run_command(command)

pair2_sam=temp_sam_path(args, "pair2")
temp_files.append(pair2_sam)
command=bowtie2_unpaired_command(args, input_fastq_path = args.pair2, output_sam_path = pair2_sam, maintain_input_ordering = True)
logger.debug("Aligning pair2 reads: "+ " ".join(command))
run_command(command)

orphan_sam = None
if args.orphan:
orphan_sam=temp_sam_path(args, "orphan")
temp_files.append(orphan_sam)
command=bowtie2_unpaired_command(args, input_fastq_path = args.orphan, output_sam_path = orphan_sam, maintain_input_ordering = args.reorder)
logger.debug("Aligning orphan reads: "+ " ".join(command))
run_command(command)

logger.debug("Processing the alignments")
process_alignments(pair1_sam, pair2_sam, orphan_sam,
aligned_pair = args.al_pair,
unaligned_pair = args.un_pair,
aligned_orphan = args.al_single,
unaligned_orphan = args.un_single,
mateIds_are_equal = args.mateIds_are_equal,
query_mate_separator = args.query_mate_separator,
treat_pair_as_aligned_if_either_read_aligned = (args.mode == "strict")
)

if args.sam and args.sam != os.devnull:
logger.debug("Aggregating .sam output to " + args.sam)
command = ["cat"]
command.extend(temp_files)
with open(args.sam, 'w') as fh:
run_command(command, stdout = fh)

logger.debug("Removing temporary files")
for file in temp_files:
utilities.remove_file(file)

Expand Down
10 changes: 10 additions & 0 deletions kneaddata/knead_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ def parse_arguments(args):
action="store_true",
dest='bmtagger',
help="run BMTagger instead of Bowtie2 to identify contaminant reads")
group1.add_argument(
"--mateIds_are_equal",
choices=('True','False'),
dest='mateIds_are_equal',
help="Are mates in paired reads equal or different. I.e. 1 and 1 or 1 and 2")
group1.add_argument(
"--query_mate_separator",
dest='query_mate_separator',
help="Seperator between queryid and mate",
required=True)
group1.add_argument(
"--bypass-trf",
action="store_true",
Expand Down
Loading