|
| 1 | + |
| 2 | +import argparse |
| 3 | +import logging |
| 4 | +import hashlib |
| 5 | +import sqlite3 |
| 6 | + |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | +from alive_progress import alive_bar |
| 10 | +from Bio import SeqIO |
| 11 | + |
| 12 | + |
| 13 | +parser = argparse.ArgumentParser( |
| 14 | + description='Annotate IPSs by NCBI nrp IDs and PSCs by nrp cluster gene labels.' |
| 15 | +) |
| 16 | +parser.add_argument('--db', action='store', help='Path to Bakta db file.') |
| 17 | +parser.add_argument('--nrp', action='store', help='Path to NCBI NRP fasta file.') |
| 18 | +parser.add_argument('--cog-ids', action='store', dest='cog_ids', help='Path to NCBI COG id / functional class file.') |
| 19 | +parser.add_argument('--nrp-cog-mapping', action='store', dest='nrp_cog_mapping', help='Path to GI / COG mapping file.') |
| 20 | +args = parser.parse_args() |
| 21 | + |
| 22 | + |
| 23 | +db_path = Path(args.db).resolve() |
| 24 | +nrp_path = Path(args.nrp).resolve() |
| 25 | +cog_ids_path = Path(args.cog_ids).resolve() |
| 26 | +nrp_cog_mapping_path = Path(args.nrp_cog_mapping).resolve() |
| 27 | + |
| 28 | + |
| 29 | +logging.basicConfig( |
| 30 | + filename='bakta.db.log', |
| 31 | + filemode='a', |
| 32 | + format='%(name)s - NCBI-NRP-COG - %(levelname)s - %(message)s', |
| 33 | + level=logging.INFO |
| 34 | +) |
| 35 | +log_ups = logging.getLogger('UPS') |
| 36 | +log_ips = logging.getLogger('IPS') |
| 37 | +log_psc = logging.getLogger('PSC') |
| 38 | + |
| 39 | + |
| 40 | +print('import NCBI COG id / function class information...') |
| 41 | +cogs_by_id = {} |
| 42 | +with cog_ids_path.open() as fh: |
| 43 | +# with cog_ids_path.open(encoding='windows-1252') as fh: |
| 44 | + for line in fh: |
| 45 | + if(line[0] != '#'): |
| 46 | + (id, cat, product, gene, pathways, pubmed, pdb) = line.split('\t') |
| 47 | + if(product.lower() == 'hypothetical protein' or product.lower() == 'uncharacterized protein' or product.lower() == 'uncharacterized conserved protein' or product.lower() == 'Uncharacterized membrane protein'): |
| 48 | + product = None |
| 49 | + if(gene == ''): |
| 50 | + gene = None |
| 51 | + if(cat == ''): |
| 52 | + cat = None |
| 53 | + cogs_by_id[id] = { |
| 54 | + 'id': id, |
| 55 | + 'cat': cat, |
| 56 | + 'product': product, |
| 57 | + 'gene': gene |
| 58 | + } |
| 59 | +print(f'\tstored COGs: {len(cogs_by_id)}') |
| 60 | + |
| 61 | +print('import NCBI GI / COG mapping information...') |
| 62 | +nrp_to_cog = {} |
| 63 | +with nrp_cog_mapping_path.open() as fh: |
| 64 | + for line in fh: |
| 65 | + cols = line.strip().split(',') |
| 66 | + # (gene_id, assembly_id, gb_id, protein_length, footprint_coordinates_protein, cog_footprint_length, cog_id, reserved_, cog_membership_class, bitscore, evalue, profile_length, footprint_coordinates_profile) |
| 67 | + cog_id = cols[6] |
| 68 | + nrp_id = cols[2] |
| 69 | + if cog_id != '' and nrp_id.startswith('WP_') and cog_id in cogs_by_id: |
| 70 | + nrp_to_cog[nrp_id] = cog_id |
| 71 | +print(f'\tmapped COG IDs: {len(nrp_to_cog)}') |
| 72 | + |
| 73 | + |
| 74 | +print('lookup UPS by NRP hash & update WP_* / gene annotations...') |
| 75 | +nrps_processed = 0 |
| 76 | +nrps_not_found = 0 |
| 77 | +nrps_wo_ips = 0 |
| 78 | +ups_updated = 0 |
| 79 | +psc_updated_gene = 0 |
| 80 | +psc_updated_product = 0 |
| 81 | +ncbi_nrp_path = Path(args.nrp).resolve() |
| 82 | +uniref90_with_cog_annotations = set() |
| 83 | +with ncbi_nrp_path.open() as fh, sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(enrich_print=False) as bar: |
| 84 | + conn.execute('PRAGMA page_size = 4096;') |
| 85 | + conn.execute('PRAGMA cache_size = 100000;') |
| 86 | + conn.execute('PRAGMA locking_mode = EXCLUSIVE;') |
| 87 | + conn.execute(f'PRAGMA mmap_size = {20 * 1024 * 1024 * 1024};') |
| 88 | + conn.execute('PRAGMA synchronous = OFF;') |
| 89 | + conn.execute('PRAGMA journal_mode = OFF') |
| 90 | + conn.execute('PRAGMA threads = 2;') |
| 91 | + conn.commit() |
| 92 | + conn.row_factory = sqlite3.Row |
| 93 | + for record in SeqIO.parse(fh, 'fasta'): |
| 94 | + nrps_processed += 1 |
| 95 | + nrp_id = record.id |
| 96 | + seq = str(record.seq).upper() |
| 97 | + seq_hash = hashlib.md5(seq.encode()) |
| 98 | + seq_hash_digest = seq_hash.digest() |
| 99 | + seq_hash_hexdigest = seq_hash.hexdigest() |
| 100 | + rec_ups = conn.execute('SELECT length, uniref100_id FROM ups WHERE hash=?', (seq_hash_digest,)).fetchone() |
| 101 | + if(rec_ups is not None): |
| 102 | + assert rec_ups['length'] == len(seq), f"Detected hash duplicate with different seq length! hash={seq_hash_hexdigest}, NCRBI-NRP-id={nrp_id}, UniParc-id={rec_ups['uniparc_id']}, NCRBI-NRP-length={rec_ups['length']}, db-length={rec_ups['length']}" |
| 103 | + conn.execute('UPDATE ups SET ncbi_nrp_id=? WHERE hash=?', (nrp_id[3:], seq_hash_digest)) # annotate UPS with NCBI nrp id (WP_*) |
| 104 | + log_ups.info('UPDATE ups SET ncbi_nrp_id=%s WHERE hash=%s', nrp_id[3:], seq_hash_hexdigest) |
| 105 | + ups_updated += 1 |
| 106 | + if(rec_ups['uniref100_id']): |
| 107 | + rec_ips = conn.execute('SELECT * FROM ips WHERE uniref100_id=?', (rec_ups['uniref100_id'],)).fetchone() |
| 108 | + if(rec_ips is not None): |
| 109 | + uniref90_id = rec_ips['uniref90_id'] |
| 110 | + cog_id = nrp_to_cog.get(nrp_id, None) |
| 111 | + if(uniref90_id is not None and cog_id is not None and uniref90_id not in uniref90_with_cog_annotations): # skip known PSC/COG annotations |
| 112 | + uniref90_with_cog_annotations.add(uniref90_id) |
| 113 | + cog = cogs_by_id[cog_id] # assured in line 67 |
| 114 | + conn.execute('UPDATE psc SET cog_id=?, cog_category=? WHERE uniref90_id=?', (cog['id'][3:], cog['cat'], uniref90_id)) |
| 115 | + log_psc.info('UPDATE psc SET cog_id=%s, cog_category=%s WHERE uniref90_id=%s', cog['id'], cog['cat'], uniref90_id) |
| 116 | + |
| 117 | + rec_psc = conn.execute('SELECT gene, product FROM psc WHERE uniref90_id=?', (uniref90_id,)).fetchone() # get current PSC gene/product annotation |
| 118 | + if(rec_psc is not None): |
| 119 | + if(cog['gene'] is not None and rec_psc['gene'] is None): # update PSC gene annotation if non-existent |
| 120 | + conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (cog['gene'], uniref90_id)) |
| 121 | + log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', cog['gene'], uniref90_id) |
| 122 | + psc_updated_gene += 1 |
| 123 | + if(cog['product'] is not None and rec_psc['product'] is None): # update PSC product annotation if non-existent |
| 124 | + conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (cog['product'], uniref90_id)) |
| 125 | + log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', cog['product'], uniref90_id) |
| 126 | + psc_updated_product += 1 |
| 127 | + else: |
| 128 | + nrps_wo_ips += 1 |
| 129 | + else: |
| 130 | + nrps_wo_ips += 1 |
| 131 | + else: |
| 132 | + nrps_not_found += 1 |
| 133 | + if((nrps_processed % 1_000_000) == 0): |
| 134 | + conn.commit() |
| 135 | + bar() |
| 136 | + conn.commit() |
| 137 | +print(f'NRPs processed: {nrps_processed}') |
| 138 | +log_ups.debug('summary: # UPS with annotated NRP IDs=%i', ups_updated) |
| 139 | +print(f'UPSs with annotated WP_* id: {ups_updated}') |
| 140 | +log_psc.debug('summary: # PSC with annotated genes=%i', psc_updated_gene) |
| 141 | +print(f'PSCs with annotated gene names: {psc_updated_gene}') |
| 142 | +log_psc.debug('summary: # PSC with annotated products=%i', psc_updated_product) |
| 143 | +print(f'PSCs with annotated gene products: {psc_updated_product}') |
| 144 | +print(f'NRPs not found: {nrps_not_found}') |
| 145 | +print(f'NRPs w/o IPS: {nrps_wo_ips}') |
0 commit comments