forked from dstern/grepfqparse
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrepfqparser.py
More file actions
151 lines (123 loc) · 4.88 KB
/
Copy pathgrepfqparser.py
File metadata and controls
151 lines (123 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
"""
Parse fastq file containing inline barcodes into separate files
DEPENDENCIES
Python 2.7
USAGE
python grepfqparser.py <input_fastq> <barcode_file> <output_folder>
e.g.
python grepfqparser.py test.fq bc.txt parsed_files
gzipped files will be detected automatically (by .gz suffix) and gunzipped prior to parsing.
This is much faster than parsing on the gzipped files.
The gunzipped file is deleted after parsing.
David L. Stern
Janelia Farm Research Campus
28 May 2013
/*
* Copyright 2013 Howard Hughes Medical Institute.
* All rights reserved.
* Use is subject to Janelia Farm Research Campus Software Copyright 1.1
* license terms ( http://license.janelia.org/license/jfrc_copyright_1_1.html ).
*/
"""
import os,sys
import getopt
import subprocess
def main():
#parse command line options
try:
opts, arg = getopt.getopt(sys.argv[1:],"ht:", ["help"])
except getopt.error, msg:
print msg
print "for help use --help"
sys.exit(2)
# process options
offset = 0
for o, a in opts:
if o == '-t':
offset = int(a)
print "using offset", offset
if o in ("-h", "--help"):
print __doc__
sys.exit(0)
if len(arg) < 3:
print "\nUsage: python grepfqparser.py [options] <input_fastq> <barcode_file> <output_folder>\m"
sys.exit(0)
#process arguments
fqFile = arg[0]
bcFile = arg[1]
OutFolder = arg[2]
if len(arg) > 3:
gzbool = arg[3]
gzbool = gzbool.upper()
else:
gzbool = "YES"
if os.path.isdir(OutFolder):
print "Directory %s exists" %(OutFolder)
else:
os.mkdir(OutFolder)
print "fastq file = %s" %(fqFile)
print "barcode file = %s" %(bcFile)
print "Folder of parsed sequences = %s" %(OutFolder)
y = fqFile.split('.')
if y[-1] == 'gz':
gzbool = "YES"
else:
gzbool = "NO"
print "fq file gzipped? = %s" %(gzbool)
if gzbool == "YES":
print "unzipping file"
tempfq = open("tempfq",'w')
errlog = open("errlog1",'w')
cmd = 'gunzip -c %s' % (fqFile)
subprocess.check_call(cmd,shell=True,stdout=tempfq,stderr=errlog)
fqFile = "tempfq"
tempfq.close()
errlog.close()
bc = open(bcFile,'r')
for line in bc:
lineItems = line.split()
barcode = lineItems[0]
barcode_up = barcode.upper()
name = lineItems[1]
print barcode
parsed_file_name = str(OutFolder + "/indiv" + name + "_" + barcode)
parsed_file = open(parsed_file_name,'w')
errlog = open("errlog2",'w')
#First grep finds lines starting with the barcode and includes the line above, and two lines below each match
#Pipe into grep again to filter out -- between matches which some (versions??) of grep insert
#Pipe into sed to remove barcodes (optional offset) and associated quality scores from each line
cmd = """grep -B 1 -A 2 ^%s %s | grep -v "^--$" | sed '2~2s/^%s//g'""" % (barcode_up, fqFile, '.'*(len(barcode_up)+offset))
try:
failed = subprocess.call(cmd, shell=True,stdout=parsed_file, stderr=errlog)
finally:
errlog.close()
parsed_file.close()
bc.close()
"""Now collect all unparsed reads"""
bc = open(bcFile,'r')
allbc = []
for line in bc:
lineItems = line.split()
barcode = lineItems[0]
allbc.append(barcode.upper())
bc.close()
"""save barcodes in new file -- bconly -- with caret at front, use this as search file"""
output_bc_file = open("bcOnly",'w')
for line in allbc:
output_bc_file.write("^" + line + "\n")
output_bc_file.close()
bconly = "bcOnly"
nomatch_file = open(OutFolder + "/nomatches",'w')
errlog = open("errlog4",'w')
cmd = "awk 'NR%%4==2' %s | grep -f %s -v" % (fqFile, bconly)
try:
nomatch = subprocess.call(cmd, shell=True, stdout=nomatch_file,stderr=errlog)
finally:
errlog.close()
nomatch_file.close()
"""delete tempfq, the gunzipped original file"""
cmd = 'rm tempfq bcOnly errlog1 errlog2 errlog3 errlog4'
subprocess.call(cmd,shell=True)
if __name__ == "__main__":
sys.exit(main())