Skip to content

Commit d7aa394

Browse files
committed
Update rfblast.py
Add Dockerfile, poetry Move to new folder Add step to perform BLAST search on invalid sequences
1 parent dbd38f9 commit d7aa394

File tree

7 files changed

+744
-266
lines changed

7 files changed

+744
-266
lines changed

scripts/rfblast/Dockerfile

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FROM python:3.9
2+
3+
ENV RNA /usr/src/rfam
4+
ENV INFERNAL /usr/src/infernal
5+
6+
RUN mkdir $INFERNAL
7+
8+
WORKDIR $RNA
9+
COPY . .
10+
COPY pyproject.toml $RNA
11+
12+
ENV PYTHONPATH=${PYTHONPATH}:${PWD}
13+
14+
# Install Infernal
15+
RUN \
16+
cd $INFERNAL && \
17+
curl -OL http://eddylab.org/infernal/infernal-1.1.4.tar.gz && \
18+
tar -xvzf infernal-1.1.4.tar.gz && \
19+
cd infernal-1.1.4 && \
20+
./configure --prefix=$INFERNAL/infernal-1.1.4 && \
21+
make && \
22+
make install && \
23+
cd easel && \
24+
make install && \
25+
cd $INFERNAL && \
26+
rm infernal-1.1.4.tar.gz
27+
28+
ENV PATH="$INFERNAL/infernal-1.1.4/bin:$RNA:$PATH"
29+
30+
RUN pip3 install poetry
31+
RUN poetry config virtualenvs.create false
32+
RUN poetry install --no-dev
33+
34+
ENTRYPOINT ["/bin/bash"]

scripts/rfblast/README.md

Whitespace-only changes.

scripts/rfblast/invalid.fa

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
>1_MN977327
2+
CCGACCCCCGGCGCAGGUCACGCAAUUUGGGUAAGGUCAUCGAUACCCUCACGUGUGGCU
3+
UCGCCGACCUCAUGGGGUAC

scripts/rfblast/poetry.lock

+372
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/rfblast/pyproject.toml

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[tool.poetry]
2+
name = "rfblast"
3+
version = "0.1.0"
4+
description = ""
5+
authors = ["Emma Cooke <[email protected]>"]
6+
readme = "README.md"
7+
8+
[tool.poetry.dependencies]
9+
python = "^3.9"
10+
requests = "^2.31.0"
11+
click = "^8.1.7"
12+
biopython = "^1.81"
13+
xmltodict = "^0.13.0"
14+
15+
16+
[tool.poetry.group.dev.dependencies]
17+
black = "^23.7.0"
18+
19+
[build-system]
20+
requires = ["poetry-core>=1.0.0"]
21+
build-backend = "poetry.core.masonry.api"

scripts/rfblast/rfblast.py

+314
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
"""
2+
Usage:
3+
4+
# navigate to the folder with the SEED and CM files
5+
cd /path/to/folder/with/SEED
6+
7+
# identify invalid accessions that need to be replaced
8+
rfblast.py validate
9+
10+
# manually upload the file `invalid.fa` to NCBI BLAST
11+
# download results in the `Single-file JSON` format
12+
13+
# replace invalid accessions in the SEED file
14+
rfblast.py replace XXXXXXXXXXX-Alignment.json
15+
16+
# where XXXXXXXXXXX-Alignment.json is the NCBI BLAST result.
17+
"""
18+
19+
import json
20+
import os
21+
import re
22+
import xmltodict
23+
24+
import click
25+
import requests
26+
27+
from Bio.Blast import NCBIWWW
28+
from Bio import SeqIO
29+
30+
31+
IDENTITY = 90
32+
QUERY_COVERAGE = 70
33+
TEMPDIR = "temp"
34+
35+
36+
def get_accession(gid):
37+
"""
38+
Get versioned accession, for example:
39+
Input:
40+
gi|2047803076|gb|CP061286.1|
41+
Output:
42+
CP061286.1
43+
"""
44+
parts = gid.split("|")
45+
return parts[-2]
46+
47+
48+
def get_blast_data(filename):
49+
"""
50+
Load BLAST JSON output.
51+
"""
52+
with open(filename, "r") as f:
53+
return json.load(f)
54+
55+
56+
def choose_replacement(data, min_identity, min_query_coverage):
57+
"""
58+
Loop over BLAST results and pick best replacement for each hit.
59+
"""
60+
# do not pick replacement from the same accession if already seen
61+
fasta = ""
62+
seen_accessions = set()
63+
for query_num, search in enumerate(data["BlastOutput2"]):
64+
query_title = search["report"]["results"]["search"]["query_title"]
65+
query_len = search["report"]["results"]["search"]["query_len"]
66+
replacement_found = False
67+
for entry in search["report"]["results"]["search"]["hits"]:
68+
acc = get_accession(entry["description"][0]["id"])
69+
if acc not in seen_accessions:
70+
seen_accessions.add(acc)
71+
replacement_found = True
72+
else:
73+
continue
74+
sequence = entry["hsps"][0]["hseq"]
75+
start = entry["hsps"][0]["hit_from"]
76+
end = entry["hsps"][0]["hit_to"]
77+
align_len = entry["hsps"][0]["align_len"]
78+
gaps = entry["hsps"][0]["gaps"]
79+
exact_matches = entry["hsps"][0]["identity"]
80+
identity = float(exact_matches) / align_len * 100
81+
query_coverage = float(align_len - gaps) / query_len * 100
82+
target_coverage = float(align_len - gaps) / len(sequence) * 100
83+
if identity >= min_identity and query_coverage >= min_query_coverage:
84+
warning = False
85+
else:
86+
warning = True
87+
summary = (
88+
"#{query_num} {message} {query_title} "
89+
"with {acc}/{start}-{end} at {identity}% identity; "
90+
"{gaps} gaps; query coverage {query_coverage}"
91+
).format(
92+
acc=acc,
93+
start=start,
94+
end=end,
95+
query_title=query_title,
96+
identity=round(identity),
97+
query_coverage=round(query_coverage),
98+
target_coverage=round(target_coverage, 2),
99+
gaps=gaps,
100+
message="Replace"
101+
if not warning
102+
else " WARNING: No replacement found for",
103+
query_num=query_num + 1,
104+
)
105+
print(summary)
106+
if not warning:
107+
fasta += ">{acc}/{start}-{end}\n{sequence}\n".format(
108+
acc=acc,
109+
start=start,
110+
end=end,
111+
sequence=sequence.replace("-", "").replace("T", "U"),
112+
)
113+
if replacement_found:
114+
break
115+
return fasta
116+
117+
118+
def generate_new_seed(fasta):
119+
filename = "replacement.fasta"
120+
with open(filename, "w") as f:
121+
f.write(fasta)
122+
if not os.path.exists("CM"):
123+
cmd = "rfsearch.pl -t 30 -nodesc -relax"
124+
os.system(cmd)
125+
if not os.path.exists("SEED"):
126+
raise Exception("Error: SEED file not found")
127+
cmd = (
128+
"cmalign --mapali SEED --noprob CM {} > tempseed && "
129+
"esl-reformat pfam tempseed > NEWSEEDtemp && "
130+
"rm tempseed"
131+
).format(filename)
132+
os.system(cmd)
133+
invalid = set()
134+
with open("invalid.txt", "r") as f:
135+
for line in f:
136+
invalid.add(line.strip())
137+
newseed = open("NEWSEED", "w")
138+
with open("NEWSEEDtemp", "r") as f:
139+
for line in f:
140+
skip = False
141+
for invalid_accession in invalid:
142+
if invalid_accession in line:
143+
skip = True
144+
break
145+
if not skip:
146+
newseed.write(line)
147+
newseed.close()
148+
os.remove("NEWSEEDtemp")
149+
cmd = (
150+
'echo "Old SEED info:" && esl-alistat SEED && '
151+
'echo "New SEED info:" && esl-alistat NEWSEED'
152+
)
153+
os.system(cmd)
154+
155+
156+
def is_valid_accession(accession):
157+
"""
158+
TB03JUN2009E__Contig_2000/988-772
159+
NZ_CP007501.1/771730-771924
160+
161+
Found:
162+
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["1119664412"],"translationset":[],"querytranslation":""}}
163+
Found but a different ID:
164+
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["EP994606.1"],"translationset":[],"translationstack":[{"term":"JCVI_SCAF_1096627298421[All Fields]","field":"All Fields","count":"1","explode":"N"},"GROUP"],"querytranslation":"JCVI_SCAF_1096627298421[All Fields]"}}
165+
Not found:
166+
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"0","retmax":"0","retstart":"0","idlist":[],"translationset":[],"querytranslation":"(TB03JUN2009E__Contig_2000[All Fields])","errorlist":{"phrasesnotfound":["TB03JUN2009E__Contig_2000"],"fieldsnotfound":[]},"warninglist":{"phrasesignored":[],"quotedphrasesnotfound":[],"outputmessages":["No items found."]}}}
167+
"""
168+
parts = accession.split("/")
169+
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term={}&retmode=json&idtype=acc"
170+
r = requests.get(url.format(parts[0]))
171+
data = r.json()
172+
if (
173+
int(data["esearchresult"]["count"]) == 1
174+
and len(data["esearchresult"]["idlist"]) > 0
175+
and data["esearchresult"]["idlist"][0] == parts[0]
176+
):
177+
return True
178+
else:
179+
return False
180+
181+
182+
def fetch_seqs(filename, accessions, label):
183+
""" """
184+
f_txt = "{}.txt".format(label)
185+
f_fa = "{}.fa".format(label)
186+
with open(f_txt, "w") as f:
187+
for accession in accessions:
188+
f.write(accession + "\n")
189+
cmd = "esl-sfetch -f {} {} > {}".format(filename, f_txt, f_fa)
190+
os.system(cmd)
191+
click.echo("Saved {} {} accessions in {}".format(len(accessions), label, f_fa))
192+
193+
194+
def parse_fasta(filename):
195+
""" """
196+
accessions = set()
197+
with open(filename, "r") as f:
198+
for line in f:
199+
if line.startswith(">"):
200+
parts = re.split(r"\s+", line)
201+
accession = parts[0].replace(">", "")
202+
accessions.add(accession)
203+
valid = set()
204+
invalid = set()
205+
for accession in accessions:
206+
if is_valid_accession(accession):
207+
click.echo("{} is valid".format(accession))
208+
valid.add(accession)
209+
else:
210+
click.echo("{} is invalid".format(accession))
211+
invalid.add(accession)
212+
os.system("esl-sfetch --index {}".format(filename))
213+
if valid:
214+
fetch_seqs(filename, valid, "valid")
215+
else:
216+
click.echo("No valid accessions found")
217+
if invalid:
218+
fetch_seqs(filename, invalid, "invalid")
219+
click.echo("=========================\nGenerated file invalid.fa")
220+
os.system("esl-seqstat {}".format("invalid.fa"))
221+
click.echo("Upload file invalid.fa to NCBI BLAST")
222+
else:
223+
click.echo("No invalid accessions found")
224+
225+
226+
@click.group()
227+
def cli():
228+
pass
229+
230+
231+
def validate(seed):
232+
"""
233+
Convert SEED to a fasta file containing sequences with unknown IDs.
234+
"""
235+
if not os.path.exists(seed):
236+
raise Exception("Error: SEED does not exist")
237+
fasta = "seed.fasta"
238+
cmd = "esl-reformat fasta {} > {}".format(seed, fasta)
239+
os.system(cmd)
240+
print(fasta)
241+
parse_fasta(fasta)
242+
243+
244+
@cli.command()
245+
@click.argument("invalid", type=click.Path(exists=True))
246+
def blast_invalid_sequences(invalid, blast_program="blastn"):
247+
"""
248+
Upload the file `invalid.fa` to NCBI BLAST and download results in the `Single-file JSON` format
249+
"""
250+
251+
blast_results = []
252+
fasta_file = "invalid.fa"
253+
sequences = SeqIO.parse(fasta_file, "fasta")
254+
255+
for seq in sequences:
256+
result_handle = NCBIWWW.qblast("blastn", "nt", seq.seq)
257+
258+
blast_results.append(result_handle.read())
259+
260+
output_file = "blast_results.xml"
261+
with open(output_file, "w") as output:
262+
for res in blast_results:
263+
output.write(res)
264+
265+
with open(output_file, "r") as xml_file:
266+
xml_data = xml_file.read()
267+
268+
xml_dict = xmltodict.parse(xml_data)
269+
json_data = json.dumps(xml_dict, indent=4)
270+
271+
with open("output.json", "w") as json_file:
272+
json_file.write(json_data)
273+
274+
print(f"BLAST search completed. Results saved to {json_file}")
275+
276+
277+
@cli.command()
278+
@click.argument("blast_json", type=click.Path(exists=True))
279+
@click.option(
280+
"--identity",
281+
default=IDENTITY,
282+
type=click.FLOAT,
283+
help="Minimum % identity between query and target",
284+
)
285+
@click.option(
286+
"--query_coverage",
287+
default=QUERY_COVERAGE,
288+
type=click.FLOAT,
289+
help="Minimum coverage of the seed sequence",
290+
)
291+
def replace(blast_json, identity, query_coverage):
292+
"""
293+
Replace unknown accessions in SEED alignment using NCBI BLAST results
294+
"""
295+
blast_data = get_blast_data(blast_json)
296+
fasta = choose_replacement(blast_data, identity, query_coverage)
297+
generate_new_seed(fasta)
298+
click.echo("Done")
299+
300+
301+
@cli.command()
302+
@click.argument("seed", type=click.Path(exists=True))
303+
def all(seed):
304+
"""
305+
Carry out all steps
306+
"""
307+
validate(seed)
308+
blast_result = blast_invalid_sequences("invalid.fa")
309+
print(blast_result)
310+
replace(blast_json=blast_result)
311+
312+
313+
if __name__ == "__main__":
314+
cli()

0 commit comments

Comments
 (0)