|
| 1 | +""" |
| 2 | +Usage: |
| 3 | +
|
| 4 | +# navigate to the folder with the SEED and CM files |
| 5 | +cd /path/to/folder/with/SEED |
| 6 | +
|
| 7 | +# identify invalid accessions that need to be replaced |
| 8 | +rfblast.py validate |
| 9 | +
|
| 10 | +# manually upload the file `invalid.fa` to NCBI BLAST |
| 11 | +# download results in the `Single-file JSON` format |
| 12 | +
|
| 13 | +# replace invalid accessions in the SEED file |
| 14 | +rfblast.py replace XXXXXXXXXXX-Alignment.json |
| 15 | +
|
| 16 | +# where XXXXXXXXXXX-Alignment.json is the NCBI BLAST result. |
| 17 | +""" |
| 18 | + |
| 19 | +import json |
| 20 | +import os |
| 21 | +import re |
| 22 | +import xmltodict |
| 23 | + |
| 24 | +import click |
| 25 | +import requests |
| 26 | + |
| 27 | +from Bio.Blast import NCBIWWW |
| 28 | +from Bio import SeqIO |
| 29 | + |
| 30 | + |
| 31 | +IDENTITY = 90 |
| 32 | +QUERY_COVERAGE = 70 |
| 33 | +TEMPDIR = "temp" |
| 34 | + |
| 35 | + |
| 36 | +def get_accession(gid): |
| 37 | + """ |
| 38 | + Get versioned accession, for example: |
| 39 | + Input: |
| 40 | + gi|2047803076|gb|CP061286.1| |
| 41 | + Output: |
| 42 | + CP061286.1 |
| 43 | + """ |
| 44 | + parts = gid.split("|") |
| 45 | + return parts[-2] |
| 46 | + |
| 47 | + |
| 48 | +def get_blast_data(filename): |
| 49 | + """ |
| 50 | + Load BLAST JSON output. |
| 51 | + """ |
| 52 | + with open(filename, "r") as f: |
| 53 | + return json.load(f) |
| 54 | + |
| 55 | + |
| 56 | +def choose_replacement(data, min_identity, min_query_coverage): |
| 57 | + """ |
| 58 | + Loop over BLAST results and pick best replacement for each hit. |
| 59 | + """ |
| 60 | + # do not pick replacement from the same accession if already seen |
| 61 | + fasta = "" |
| 62 | + seen_accessions = set() |
| 63 | + for query_num, search in enumerate(data["BlastOutput2"]): |
| 64 | + query_title = search["report"]["results"]["search"]["query_title"] |
| 65 | + query_len = search["report"]["results"]["search"]["query_len"] |
| 66 | + replacement_found = False |
| 67 | + for entry in search["report"]["results"]["search"]["hits"]: |
| 68 | + acc = get_accession(entry["description"][0]["id"]) |
| 69 | + if acc not in seen_accessions: |
| 70 | + seen_accessions.add(acc) |
| 71 | + replacement_found = True |
| 72 | + else: |
| 73 | + continue |
| 74 | + sequence = entry["hsps"][0]["hseq"] |
| 75 | + start = entry["hsps"][0]["hit_from"] |
| 76 | + end = entry["hsps"][0]["hit_to"] |
| 77 | + align_len = entry["hsps"][0]["align_len"] |
| 78 | + gaps = entry["hsps"][0]["gaps"] |
| 79 | + exact_matches = entry["hsps"][0]["identity"] |
| 80 | + identity = float(exact_matches) / align_len * 100 |
| 81 | + query_coverage = float(align_len - gaps) / query_len * 100 |
| 82 | + target_coverage = float(align_len - gaps) / len(sequence) * 100 |
| 83 | + if identity >= min_identity and query_coverage >= min_query_coverage: |
| 84 | + warning = False |
| 85 | + else: |
| 86 | + warning = True |
| 87 | + summary = ( |
| 88 | + "#{query_num} {message} {query_title} " |
| 89 | + "with {acc}/{start}-{end} at {identity}% identity; " |
| 90 | + "{gaps} gaps; query coverage {query_coverage}" |
| 91 | + ).format( |
| 92 | + acc=acc, |
| 93 | + start=start, |
| 94 | + end=end, |
| 95 | + query_title=query_title, |
| 96 | + identity=round(identity), |
| 97 | + query_coverage=round(query_coverage), |
| 98 | + target_coverage=round(target_coverage, 2), |
| 99 | + gaps=gaps, |
| 100 | + message="Replace" |
| 101 | + if not warning |
| 102 | + else " WARNING: No replacement found for", |
| 103 | + query_num=query_num + 1, |
| 104 | + ) |
| 105 | + print(summary) |
| 106 | + if not warning: |
| 107 | + fasta += ">{acc}/{start}-{end}\n{sequence}\n".format( |
| 108 | + acc=acc, |
| 109 | + start=start, |
| 110 | + end=end, |
| 111 | + sequence=sequence.replace("-", "").replace("T", "U"), |
| 112 | + ) |
| 113 | + if replacement_found: |
| 114 | + break |
| 115 | + return fasta |
| 116 | + |
| 117 | + |
| 118 | +def generate_new_seed(fasta): |
| 119 | + filename = "replacement.fasta" |
| 120 | + with open(filename, "w") as f: |
| 121 | + f.write(fasta) |
| 122 | + if not os.path.exists("CM"): |
| 123 | + cmd = "rfsearch.pl -t 30 -nodesc -relax" |
| 124 | + os.system(cmd) |
| 125 | + if not os.path.exists("SEED"): |
| 126 | + raise Exception("Error: SEED file not found") |
| 127 | + cmd = ( |
| 128 | + "cmalign --mapali SEED --noprob CM {} > tempseed && " |
| 129 | + "esl-reformat pfam tempseed > NEWSEEDtemp && " |
| 130 | + "rm tempseed" |
| 131 | + ).format(filename) |
| 132 | + os.system(cmd) |
| 133 | + invalid = set() |
| 134 | + with open("invalid.txt", "r") as f: |
| 135 | + for line in f: |
| 136 | + invalid.add(line.strip()) |
| 137 | + newseed = open("NEWSEED", "w") |
| 138 | + with open("NEWSEEDtemp", "r") as f: |
| 139 | + for line in f: |
| 140 | + skip = False |
| 141 | + for invalid_accession in invalid: |
| 142 | + if invalid_accession in line: |
| 143 | + skip = True |
| 144 | + break |
| 145 | + if not skip: |
| 146 | + newseed.write(line) |
| 147 | + newseed.close() |
| 148 | + os.remove("NEWSEEDtemp") |
| 149 | + cmd = ( |
| 150 | + 'echo "Old SEED info:" && esl-alistat SEED && ' |
| 151 | + 'echo "New SEED info:" && esl-alistat NEWSEED' |
| 152 | + ) |
| 153 | + os.system(cmd) |
| 154 | + |
| 155 | + |
| 156 | +def is_valid_accession(accession): |
| 157 | + """ |
| 158 | + TB03JUN2009E__Contig_2000/988-772 |
| 159 | + NZ_CP007501.1/771730-771924 |
| 160 | +
|
| 161 | + Found: |
| 162 | + {"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["1119664412"],"translationset":[],"querytranslation":""}} |
| 163 | + Found but a different ID: |
| 164 | + {"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"1","retmax":"1","retstart":"0","idlist":["EP994606.1"],"translationset":[],"translationstack":[{"term":"JCVI_SCAF_1096627298421[All Fields]","field":"All Fields","count":"1","explode":"N"},"GROUP"],"querytranslation":"JCVI_SCAF_1096627298421[All Fields]"}} |
| 165 | + Not found: |
| 166 | + {"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"0","retmax":"0","retstart":"0","idlist":[],"translationset":[],"querytranslation":"(TB03JUN2009E__Contig_2000[All Fields])","errorlist":{"phrasesnotfound":["TB03JUN2009E__Contig_2000"],"fieldsnotfound":[]},"warninglist":{"phrasesignored":[],"quotedphrasesnotfound":[],"outputmessages":["No items found."]}}} |
| 167 | + """ |
| 168 | + parts = accession.split("/") |
| 169 | + url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term={}&retmode=json&idtype=acc" |
| 170 | + r = requests.get(url.format(parts[0])) |
| 171 | + data = r.json() |
| 172 | + if ( |
| 173 | + int(data["esearchresult"]["count"]) == 1 |
| 174 | + and len(data["esearchresult"]["idlist"]) > 0 |
| 175 | + and data["esearchresult"]["idlist"][0] == parts[0] |
| 176 | + ): |
| 177 | + return True |
| 178 | + else: |
| 179 | + return False |
| 180 | + |
| 181 | + |
| 182 | +def fetch_seqs(filename, accessions, label): |
| 183 | + """ """ |
| 184 | + f_txt = "{}.txt".format(label) |
| 185 | + f_fa = "{}.fa".format(label) |
| 186 | + with open(f_txt, "w") as f: |
| 187 | + for accession in accessions: |
| 188 | + f.write(accession + "\n") |
| 189 | + cmd = "esl-sfetch -f {} {} > {}".format(filename, f_txt, f_fa) |
| 190 | + os.system(cmd) |
| 191 | + click.echo("Saved {} {} accessions in {}".format(len(accessions), label, f_fa)) |
| 192 | + |
| 193 | + |
| 194 | +def parse_fasta(filename): |
| 195 | + """ """ |
| 196 | + accessions = set() |
| 197 | + with open(filename, "r") as f: |
| 198 | + for line in f: |
| 199 | + if line.startswith(">"): |
| 200 | + parts = re.split(r"\s+", line) |
| 201 | + accession = parts[0].replace(">", "") |
| 202 | + accessions.add(accession) |
| 203 | + valid = set() |
| 204 | + invalid = set() |
| 205 | + for accession in accessions: |
| 206 | + if is_valid_accession(accession): |
| 207 | + click.echo("{} is valid".format(accession)) |
| 208 | + valid.add(accession) |
| 209 | + else: |
| 210 | + click.echo("{} is invalid".format(accession)) |
| 211 | + invalid.add(accession) |
| 212 | + os.system("esl-sfetch --index {}".format(filename)) |
| 213 | + if valid: |
| 214 | + fetch_seqs(filename, valid, "valid") |
| 215 | + else: |
| 216 | + click.echo("No valid accessions found") |
| 217 | + if invalid: |
| 218 | + fetch_seqs(filename, invalid, "invalid") |
| 219 | + click.echo("=========================\nGenerated file invalid.fa") |
| 220 | + os.system("esl-seqstat {}".format("invalid.fa")) |
| 221 | + click.echo("Upload file invalid.fa to NCBI BLAST") |
| 222 | + else: |
| 223 | + click.echo("No invalid accessions found") |
| 224 | + |
| 225 | + |
| 226 | +@click.group() |
| 227 | +def cli(): |
| 228 | + pass |
| 229 | + |
| 230 | + |
| 231 | +def validate(seed): |
| 232 | + """ |
| 233 | + Convert SEED to a fasta file containing sequences with unknown IDs. |
| 234 | + """ |
| 235 | + if not os.path.exists(seed): |
| 236 | + raise Exception("Error: SEED does not exist") |
| 237 | + fasta = "seed.fasta" |
| 238 | + cmd = "esl-reformat fasta {} > {}".format(seed, fasta) |
| 239 | + os.system(cmd) |
| 240 | + print(fasta) |
| 241 | + parse_fasta(fasta) |
| 242 | + |
| 243 | + |
| 244 | +@cli.command() |
| 245 | +@click.argument("invalid", type=click.Path(exists=True)) |
| 246 | +def blast_invalid_sequences(invalid, blast_program="blastn"): |
| 247 | + """ |
| 248 | + Upload the file `invalid.fa` to NCBI BLAST and download results in the `Single-file JSON` format |
| 249 | + """ |
| 250 | + |
| 251 | + blast_results = [] |
| 252 | + fasta_file = "invalid.fa" |
| 253 | + sequences = SeqIO.parse(fasta_file, "fasta") |
| 254 | + |
| 255 | + for seq in sequences: |
| 256 | + result_handle = NCBIWWW.qblast("blastn", "nt", seq.seq) |
| 257 | + |
| 258 | + blast_results.append(result_handle.read()) |
| 259 | + |
| 260 | + output_file = "blast_results.xml" |
| 261 | + with open(output_file, "w") as output: |
| 262 | + for res in blast_results: |
| 263 | + output.write(res) |
| 264 | + |
| 265 | + with open(output_file, "r") as xml_file: |
| 266 | + xml_data = xml_file.read() |
| 267 | + |
| 268 | + xml_dict = xmltodict.parse(xml_data) |
| 269 | + json_data = json.dumps(xml_dict, indent=4) |
| 270 | + |
| 271 | + with open("output.json", "w") as json_file: |
| 272 | + json_file.write(json_data) |
| 273 | + |
| 274 | + print(f"BLAST search completed. Results saved to {json_file}") |
| 275 | + |
| 276 | + |
| 277 | +@cli.command() |
| 278 | +@click.argument("blast_json", type=click.Path(exists=True)) |
| 279 | +@click.option( |
| 280 | + "--identity", |
| 281 | + default=IDENTITY, |
| 282 | + type=click.FLOAT, |
| 283 | + help="Minimum % identity between query and target", |
| 284 | +) |
| 285 | +@click.option( |
| 286 | + "--query_coverage", |
| 287 | + default=QUERY_COVERAGE, |
| 288 | + type=click.FLOAT, |
| 289 | + help="Minimum coverage of the seed sequence", |
| 290 | +) |
| 291 | +def replace(blast_json, identity, query_coverage): |
| 292 | + """ |
| 293 | + Replace unknown accessions in SEED alignment using NCBI BLAST results |
| 294 | + """ |
| 295 | + blast_data = get_blast_data(blast_json) |
| 296 | + fasta = choose_replacement(blast_data, identity, query_coverage) |
| 297 | + generate_new_seed(fasta) |
| 298 | + click.echo("Done") |
| 299 | + |
| 300 | + |
| 301 | +@cli.command() |
| 302 | +@click.argument("seed", type=click.Path(exists=True)) |
| 303 | +def all(seed): |
| 304 | + """ |
| 305 | + Carry out all steps |
| 306 | + """ |
| 307 | + validate(seed) |
| 308 | + blast_result = blast_invalid_sequences("invalid.fa") |
| 309 | + print(blast_result) |
| 310 | + replace(blast_json=blast_result) |
| 311 | + |
| 312 | + |
| 313 | +if __name__ == "__main__": |
| 314 | + cli() |
0 commit comments