Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Rfblast #171

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions scripts/rfblast/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
FROM python:3.9

ENV RNA /usr/src/rfam
ENV INFERNAL /usr/src/infernal

RUN mkdir $INFERNAL

WORKDIR $RNA
COPY . .
COPY pyproject.toml $RNA

ENV PYTHONPATH=${PYTHONPATH}:${PWD}

# Install Infernal
RUN \
cd $INFERNAL && \
curl -OL http://eddylab.org/infernal/infernal-1.1.4.tar.gz && \
tar -xvzf infernal-1.1.4.tar.gz && \
cd infernal-1.1.4 && \
./configure --prefix=$INFERNAL/infernal-1.1.4 && \
make && \
make install && \
cd easel && \
make install && \
cd $INFERNAL && \
rm infernal-1.1.4.tar.gz

ENV PATH="$INFERNAL/infernal-1.1.4/bin:$RNA:$PATH"

RUN pip3 install poetry
RUN poetry config virtualenvs.create false
RUN poetry install --no-dev

ENTRYPOINT ["/bin/bash"]
Empty file added scripts/rfblast/README.md
Empty file.
3 changes: 3 additions & 0 deletions scripts/rfblast/invalid.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
>1_MN977327
CCGACCCCCGGCGCAGGUCACGCAAUUUGGGUAAGGUCAUCGAUACCCUCACGUGUGGCU
UCGCCGACCUCAUGGGGUAC
372 changes: 372 additions & 0 deletions scripts/rfblast/poetry.lock

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions scripts/rfblast/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[tool.poetry]
name = "rfblast"
version = "0.1.0"
description = ""
authors = ["Emma Cooke <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
requests = "^2.31.0"
click = "^8.1.7"
biopython = "^1.81"
xmltodict = "^0.13.0"


[tool.poetry.group.dev.dependencies]
black = "^23.7.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
314 changes: 314 additions & 0 deletions scripts/rfblast/rfblast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
"""
Usage:

# navigate to the folder with the SEED and CM files
cd /path/to/folder/with/SEED

# identify invalid accessions that need to be replaced
rfblast.py validate

# manually upload the file `invalid.fa` to NCBI BLAST
# download results in the `Single-file JSON` format

# replace invalid accessions in the SEED file
rfblast.py replace XXXXXXXXXXX-Alignment.json

# where XXXXXXXXXXX-Alignment.json is the NCBI BLAST result.
"""

import json
import os
import re
import xmltodict

import click
import requests

from Bio.Blast import NCBIWWW
from Bio import SeqIO


IDENTITY = 90
QUERY_COVERAGE = 70
TEMPDIR = "temp"


def get_accession(gid):
"""
Get versioned accession, for example:
Input:
gi|2047803076|gb|CP061286.1|
Output:
CP061286.1
"""
parts = gid.split("|")
return parts[-2]


def get_blast_data(filename):
"""
Load BLAST JSON output.
"""
with open(filename, "r") as f:
return json.load(f)


def choose_replacement(data, min_identity, min_query_coverage):
"""
Loop over BLAST results and pick best replacement for each hit.
"""
# do not pick replacement from the same accession if already seen
fasta = ""
seen_accessions = set()
for query_num, search in enumerate(data["BlastOutput2"]):
query_title = search["report"]["results"]["search"]["query_title"]
query_len = search["report"]["results"]["search"]["query_len"]
replacement_found = False
for entry in search["report"]["results"]["search"]["hits"]:
acc = get_accession(entry["description"][0]["id"])
if acc not in seen_accessions:
seen_accessions.add(acc)
replacement_found = True
else:
continue
sequence = entry["hsps"][0]["hseq"]
start = entry["hsps"][0]["hit_from"]
end = entry["hsps"][0]["hit_to"]
align_len = entry["hsps"][0]["align_len"]
gaps = entry["hsps"][0]["gaps"]
exact_matches = entry["hsps"][0]["identity"]
identity = float(exact_matches) / align_len * 100
query_coverage = float(align_len - gaps) / query_len * 100
target_coverage = float(align_len - gaps) / len(sequence) * 100
if identity >= min_identity and query_coverage >= min_query_coverage:
warning = False
else:
warning = True
summary = (
"#{query_num} {message} {query_title} "
"with {acc}/{start}-{end} at {identity}% identity; "
"{gaps} gaps; query coverage {query_coverage}"
).format(
acc=acc,
start=start,
end=end,
query_title=query_title,
identity=round(identity),
query_coverage=round(query_coverage),
target_coverage=round(target_coverage, 2),
gaps=gaps,
message="Replace"
if not warning
else " WARNING: No replacement found for",
query_num=query_num + 1,
)
print(summary)
if not warning:
fasta += ">{acc}/{start}-{end}\n{sequence}\n".format(
acc=acc,
start=start,
end=end,
sequence=sequence.replace("-", "").replace("T", "U"),
)
if replacement_found:
break
return fasta


def generate_new_seed(fasta):
filename = "replacement.fasta"
with open(filename, "w") as f:
f.write(fasta)
if not os.path.exists("CM"):
cmd = "rfsearch.pl -t 30 -nodesc -relax"
os.system(cmd)
if not os.path.exists("SEED"):
raise Exception("Error: SEED file not found")
cmd = (
"cmalign --mapali SEED --noprob CM {} > tempseed && "
"esl-reformat pfam tempseed > NEWSEEDtemp && "
"rm tempseed"
).format(filename)
os.system(cmd)
invalid = set()
with open("invalid.txt", "r") as f:
for line in f:
invalid.add(line.strip())
newseed = open("NEWSEED", "w")
with open("NEWSEEDtemp", "r") as f:
for line in f:
skip = False
for invalid_accession in invalid:
if invalid_accession in line:
skip = True
break
if not skip:
newseed.write(line)
newseed.close()
os.remove("NEWSEEDtemp")
cmd = (
'echo "Old SEED info:" && esl-alistat SEED && '
'echo "New SEED info:" && esl-alistat NEWSEED'
)
os.system(cmd)


def is_valid_accession(accession):
"""
TB03JUN2009E__Contig_2000/988-772
NZ_CP007501.1/771730-771924

Found:
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["1119664412"],"translationset":[],"querytranslation":""}}
Found but a different ID:
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["EP994606.1"],"translationset":[],"translationstack":[{"term":"JCVI_SCAF_1096627298421[All Fields]","field":"All Fields","count":"1","explode":"N"},"GROUP"],"querytranslation":"JCVI_SCAF_1096627298421[All Fields]"}}
Not found:
{"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"0","retmax":"0","retstart":"0","idlist":[],"translationset":[],"querytranslation":"(TB03JUN2009E__Contig_2000[All Fields])","errorlist":{"phrasesnotfound":["TB03JUN2009E__Contig_2000"],"fieldsnotfound":[]},"warninglist":{"phrasesignored":[],"quotedphrasesnotfound":[],"outputmessages":["No items found."]}}}
"""
parts = accession.split("/")
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term={}&retmode=json&idtype=acc"
r = requests.get(url.format(parts[0]))
data = r.json()
if (
int(data["esearchresult"]["count"]) == 1
and len(data["esearchresult"]["idlist"]) > 0
and data["esearchresult"]["idlist"][0] == parts[0]
):
return True
else:
return False


def fetch_seqs(filename, accessions, label):
""" """
f_txt = "{}.txt".format(label)
f_fa = "{}.fa".format(label)
with open(f_txt, "w") as f:
for accession in accessions:
f.write(accession + "\n")
cmd = "esl-sfetch -f {} {} > {}".format(filename, f_txt, f_fa)
os.system(cmd)
click.echo("Saved {} {} accessions in {}".format(len(accessions), label, f_fa))


def parse_fasta(filename):
""" """
accessions = set()
with open(filename, "r") as f:
for line in f:
if line.startswith(">"):
parts = re.split(r"\s+", line)
accession = parts[0].replace(">", "")
accessions.add(accession)
valid = set()
invalid = set()
for accession in accessions:
if is_valid_accession(accession):
click.echo("{} is valid".format(accession))
valid.add(accession)
else:
click.echo("{} is invalid".format(accession))
invalid.add(accession)
os.system("esl-sfetch --index {}".format(filename))
if valid:
fetch_seqs(filename, valid, "valid")
else:
click.echo("No valid accessions found")
if invalid:
fetch_seqs(filename, invalid, "invalid")
click.echo("=========================\nGenerated file invalid.fa")
os.system("esl-seqstat {}".format("invalid.fa"))
click.echo("Upload file invalid.fa to NCBI BLAST")
else:
click.echo("No invalid accessions found")


@click.group()
def cli():
pass


def validate(seed):
"""
Convert SEED to a fasta file containing sequences with unknown IDs.
"""
if not os.path.exists(seed):
raise Exception("Error: SEED does not exist")
fasta = "seed.fasta"
cmd = "esl-reformat fasta {} > {}".format(seed, fasta)
os.system(cmd)
print(fasta)
parse_fasta(fasta)


@cli.command()
@click.argument("invalid", type=click.Path(exists=True))
def blast_invalid_sequences(invalid, blast_program="blastn"):
"""
Upload the file `invalid.fa` to NCBI BLAST and download results in the `Single-file JSON` format
"""

blast_results = []
fasta_file = "invalid.fa"
sequences = SeqIO.parse(fasta_file, "fasta")

for seq in sequences:
result_handle = NCBIWWW.qblast("blastn", "nt", seq.seq)

blast_results.append(result_handle.read())

output_file = "blast_results.xml"
with open(output_file, "w") as output:
for res in blast_results:
output.write(res)

with open(output_file, "r") as xml_file:
xml_data = xml_file.read()

xml_dict = xmltodict.parse(xml_data)
json_data = json.dumps(xml_dict, indent=4)

with open("output.json", "w") as json_file:
json_file.write(json_data)

print(f"BLAST search completed. Results saved to {json_file}")


@cli.command()
@click.argument("blast_json", type=click.Path(exists=True))
@click.option(
"--identity",
default=IDENTITY,
type=click.FLOAT,
help="Minimum % identity between query and target",
)
@click.option(
"--query_coverage",
default=QUERY_COVERAGE,
type=click.FLOAT,
help="Minimum coverage of the seed sequence",
)
def replace(blast_json, identity, query_coverage):
"""
Replace unknown accessions in SEED alignment using NCBI BLAST results
"""
blast_data = get_blast_data(blast_json)
fasta = choose_replacement(blast_data, identity, query_coverage)
generate_new_seed(fasta)
click.echo("Done")


@cli.command()
@click.argument("seed", type=click.Path(exists=True))
def all(seed):
"""
Carry out all steps
"""
validate(seed)
blast_result = blast_invalid_sequences("invalid.fa")
print(blast_result)
replace(blast_json=blast_result)


if __name__ == "__main__":
cli()
Loading