Skip to content

Commit bf4a31b

Browse files
authored
update Db scripts (#367)
* update NCBI NRP downloads * replace lxml by python xml lib * introduce COG24, discard PCLA and merge with UPS/NRP matching * discard xopen threads parameter forstalling external processes * disable alive_progress output enrichments * delete debugging comments * refactor code * discard old COG NCBI NRP scripts * mute alive_bar enriched prints * fix NCBIfams parsing * tune hmmsearch block size * set SQLITE TMPDIR
1 parent 2abea0a commit bf4a31b

16 files changed

+480
-588
lines changed

db-scripts/annotate-cog.py

-108
This file was deleted.

db-scripts/annotate-is.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
conn.execute('PRAGMA threads = 2;')
4646
conn.commit()
4747

48-
with ips_alignments_path.open() as fh, alive_bar() as bar:
48+
with ips_alignments_path.open() as fh, alive_bar(enrich_print=False) as bar:
4949
for line in fh:
5050
(uniref100_id, sseqid, stitle, length, pident, qlen, slen, evalue) = line.strip().split('\t')
5151
length = int(length)
@@ -73,7 +73,7 @@
7373
log_ips.debug('summary: IPS annotated=%i', ips_updated)
7474

7575

76-
with psc_alignments_path.open() as fh, alive_bar() as bar:
76+
with psc_alignments_path.open() as fh, alive_bar(enrich_print=False) as bar:
7777
for line in fh:
7878
(uniref90_id, sseqid, stitle, length, pident, qlen, slen, evalue) = line.strip().split('\t')
7979
length = int(length)

db-scripts/annotate-kofams.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161

6262
print('parse Kofam hits...')
6363
hit_per_psc = {}
64-
with hmm_result_path.open() as fh, alive_bar() as bar:
64+
with hmm_result_path.open() as fh, alive_bar(enrich_print=False) as bar:
6565
for line in fh:
6666
if(line[0] != '#'):
6767
(psc_id, _, hmm_name, hmm_id, evalue, bitscore, _) = re.split(r'\s+', line.strip(), maxsplit=6)
@@ -84,7 +84,7 @@
8484

8585
psc_annotated = 0
8686
ecs_added = 0
87-
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar:
87+
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc), enrich_print=False) as bar:
8888
conn.execute('PRAGMA page_size = 4096;')
8989
conn.execute('PRAGMA cache_size = 100000;')
9090
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')

db-scripts/annotate-ncbi-amr.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848

4949
print('lookup IPS by AMR NRP id (WP_*) and update gene/product annotations...')
5050
conn.row_factory = sqlite3.Row
51-
with genes_path.open() as fh, alive_bar() as bar:
51+
with genes_path.open() as fh, alive_bar(enrich_print=False) as bar:
5252
for line in fh:
5353
nrps_processed += 1
5454
(

db-scripts/annotate-ncbi-fams.py

+26-25
Original file line numberDiff line numberDiff line change
@@ -37,32 +37,33 @@
3737

3838
print('parse NCBIfam HMMs...')
3939
hmms = {}
40-
with hmms_path.open() as fh, alive_bar() as bar:
40+
with hmms_path.open() as fh, alive_bar(enrich_print=False) as bar:
4141
for line in fh:
4242
(
43-
ncbi_accession,
44-
source_identifier,
45-
label,
46-
sequence_cutoff,
47-
domain_cutoff,
48-
hmm_length,
49-
family_type,
50-
for_structural_annotation,
51-
for_naming,
52-
for_AMRFinder,
53-
product_name,
54-
gene_symbol,
55-
ec_numbers,
56-
go_terms,
57-
pmids,
58-
taxonomic_range,
59-
taxonomic_range_name,
60-
taxonomic_rank_name,
61-
n_refseq_protein_hits,
62-
source,
43+
ncbi_accession,
44+
source_identifier,
45+
label,
46+
sequence_cutoff,
47+
domain_cutoff,
48+
hmm_length,
49+
family_type,
50+
for_structural_annotation,
51+
for_naming,
52+
for_AMRFinder,
53+
product_name,
54+
gene_symbol,
55+
gene_synonyms,
56+
ec_numbers,
57+
go_terms,
58+
pmids,
59+
taxonomic_range,
60+
taxonomic_range_name,
61+
taxonomic_rank_name,
62+
n_refseq_protein_hits,
63+
source,
6364
name_orig,
64-
_1,
65-
_2
65+
hmm_name,
66+
comment
6667
) = line.split('\t')
6768
if(family_type in family_type_ranks.keys() and for_naming == 'Y'): # only accept exception/equivalog HMMs eligible for naming proteins
6869
hmm = {
@@ -84,7 +85,7 @@
8485

8586
print('parse NCBIfam hits...')
8687
hit_per_psc = {}
87-
with hmm_result_path.open() as fh, alive_bar() as bar:
88+
with hmm_result_path.open() as fh, alive_bar(enrich_print=False) as bar:
8889
for line in fh:
8990
if(line[0] != '#'):
9091
(psc_id, _, hmm_name, hmm_id, evalue, bitscore, _) = re.split(r'\s+', line.strip(), maxsplit=6)
@@ -116,7 +117,7 @@
116117
psc_with_gene = 0
117118
psc_with_ec = 0
118119
psc_with_go = 0
119-
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar:
120+
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc), enrich_print=False) as bar:
120121
conn.execute('PRAGMA page_size = 4096;')
121122
conn.execute('PRAGMA cache_size = 100000;')
122123
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')

db-scripts/annotate-ncbi-nrp-cog.py

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
2+
import argparse
3+
import logging
4+
import hashlib
5+
import sqlite3
6+
7+
from pathlib import Path
8+
9+
from alive_progress import alive_bar
10+
from Bio import SeqIO
11+
12+
13+
parser = argparse.ArgumentParser(
14+
description='Annotate IPSs by NCBI nrp IDs and PSCs by nrp cluster gene labels.'
15+
)
16+
parser.add_argument('--db', action='store', help='Path to Bakta db file.')
17+
parser.add_argument('--nrp', action='store', help='Path to NCBI NRP fasta file.')
18+
parser.add_argument('--cog-ids', action='store', dest='cog_ids', help='Path to NCBI COG id / functional class file.')
19+
parser.add_argument('--nrp-cog-mapping', action='store', dest='nrp_cog_mapping', help='Path to GI / COG mapping file.')
20+
args = parser.parse_args()
21+
22+
23+
db_path = Path(args.db).resolve()
24+
nrp_path = Path(args.nrp).resolve()
25+
cog_ids_path = Path(args.cog_ids).resolve()
26+
nrp_cog_mapping_path = Path(args.nrp_cog_mapping).resolve()
27+
28+
29+
logging.basicConfig(
30+
filename='bakta.db.log',
31+
filemode='a',
32+
format='%(name)s - NCBI-NRP-COG - %(levelname)s - %(message)s',
33+
level=logging.INFO
34+
)
35+
log_ups = logging.getLogger('UPS')
36+
log_ips = logging.getLogger('IPS')
37+
log_psc = logging.getLogger('PSC')
38+
39+
40+
print('import NCBI COG id / function class information...')
41+
cogs_by_id = {}
42+
with cog_ids_path.open() as fh:
43+
# with cog_ids_path.open(encoding='windows-1252') as fh:
44+
for line in fh:
45+
if(line[0] != '#'):
46+
(id, cat, product, gene, pathways, pubmed, pdb) = line.split('\t')
47+
if(product.lower() == 'hypothetical protein' or product.lower() == 'uncharacterized protein' or product.lower() == 'uncharacterized conserved protein' or product.lower() == 'Uncharacterized membrane protein'):
48+
product = None
49+
if(gene == ''):
50+
gene = None
51+
if(cat == ''):
52+
cat = None
53+
cogs_by_id[id] = {
54+
'id': id,
55+
'cat': cat,
56+
'product': product,
57+
'gene': gene
58+
}
59+
print(f'\tstored COGs: {len(cogs_by_id)}')
60+
61+
print('import NCBI GI / COG mapping information...')
62+
nrp_to_cog = {}
63+
with nrp_cog_mapping_path.open() as fh:
64+
for line in fh:
65+
cols = line.strip().split(',')
66+
# (gene_id, assembly_id, gb_id, protein_length, footprint_coordinates_protein, cog_footprint_length, cog_id, reserved_, cog_membership_class, bitscore, evalue, profile_length, footprint_coordinates_profile)
67+
cog_id = cols[6]
68+
nrp_id = cols[2]
69+
if cog_id != '' and nrp_id.startswith('WP_') and cog_id in cogs_by_id:
70+
nrp_to_cog[nrp_id] = cog_id
71+
print(f'\tmapped COG IDs: {len(nrp_to_cog)}')
72+
73+
74+
print('lookup UPS by NRP hash & update WP_* / gene annotations...')
75+
nrps_processed = 0
76+
nrps_not_found = 0
77+
nrps_wo_ips = 0
78+
ups_updated = 0
79+
psc_updated_gene = 0
80+
psc_updated_product = 0
81+
ncbi_nrp_path = Path(args.nrp).resolve()
82+
uniref90_with_cog_annotations = set()
83+
with ncbi_nrp_path.open() as fh, sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(enrich_print=False) as bar:
84+
conn.execute('PRAGMA page_size = 4096;')
85+
conn.execute('PRAGMA cache_size = 100000;')
86+
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
87+
conn.execute(f'PRAGMA mmap_size = {20 * 1024 * 1024 * 1024};')
88+
conn.execute('PRAGMA synchronous = OFF;')
89+
conn.execute('PRAGMA journal_mode = OFF')
90+
conn.execute('PRAGMA threads = 2;')
91+
conn.commit()
92+
conn.row_factory = sqlite3.Row
93+
for record in SeqIO.parse(fh, 'fasta'):
94+
nrps_processed += 1
95+
nrp_id = record.id
96+
seq = str(record.seq).upper()
97+
seq_hash = hashlib.md5(seq.encode())
98+
seq_hash_digest = seq_hash.digest()
99+
seq_hash_hexdigest = seq_hash.hexdigest()
100+
rec_ups = conn.execute('SELECT length, uniref100_id FROM ups WHERE hash=?', (seq_hash_digest,)).fetchone()
101+
if(rec_ups is not None):
102+
assert rec_ups['length'] == len(seq), f"Detected hash duplicate with different seq length! hash={seq_hash_hexdigest}, NCRBI-NRP-id={nrp_id}, UniParc-id={rec_ups['uniparc_id']}, NCRBI-NRP-length={rec_ups['length']}, db-length={rec_ups['length']}"
103+
conn.execute('UPDATE ups SET ncbi_nrp_id=? WHERE hash=?', (nrp_id[3:], seq_hash_digest)) # annotate UPS with NCBI nrp id (WP_*)
104+
log_ups.info('UPDATE ups SET ncbi_nrp_id=%s WHERE hash=%s', nrp_id[3:], seq_hash_hexdigest)
105+
ups_updated += 1
106+
if(rec_ups['uniref100_id']):
107+
rec_ips = conn.execute('SELECT * FROM ips WHERE uniref100_id=?', (rec_ups['uniref100_id'],)).fetchone()
108+
if(rec_ips is not None):
109+
uniref90_id = rec_ips['uniref90_id']
110+
cog_id = nrp_to_cog.get(nrp_id, None)
111+
if(uniref90_id is not None and cog_id is not None and uniref90_id not in uniref90_with_cog_annotations): # skip known PSC/COG annotations
112+
uniref90_with_cog_annotations.add(uniref90_id)
113+
cog = cogs_by_id[cog_id] # assured in line 67
114+
conn.execute('UPDATE psc SET cog_id=?, cog_category=? WHERE uniref90_id=?', (cog['id'][3:], cog['cat'], uniref90_id))
115+
log_psc.info('UPDATE psc SET cog_id=%s, cog_category=%s WHERE uniref90_id=%s', cog['id'], cog['cat'], uniref90_id)
116+
117+
rec_psc = conn.execute('SELECT gene, product FROM psc WHERE uniref90_id=?', (uniref90_id,)).fetchone() # get current PSC gene/product annotation
118+
if(rec_psc is not None):
119+
if(cog['gene'] is not None and rec_psc['gene'] is None): # update PSC gene annotation if non-existent
120+
conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (cog['gene'], uniref90_id))
121+
log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', cog['gene'], uniref90_id)
122+
psc_updated_gene += 1
123+
if(cog['product'] is not None and rec_psc['product'] is None): # update PSC product annotation if non-existent
124+
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (cog['product'], uniref90_id))
125+
log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', cog['product'], uniref90_id)
126+
psc_updated_product += 1
127+
else:
128+
nrps_wo_ips += 1
129+
else:
130+
nrps_wo_ips += 1
131+
else:
132+
nrps_not_found += 1
133+
if((nrps_processed % 1_000_000) == 0):
134+
conn.commit()
135+
bar()
136+
conn.commit()
137+
print(f'NRPs processed: {nrps_processed}')
138+
log_ups.debug('summary: # UPS with annotated NRP IDs=%i', ups_updated)
139+
print(f'UPSs with annotated WP_* id: {ups_updated}')
140+
log_psc.debug('summary: # PSC with annotated genes=%i', psc_updated_gene)
141+
print(f'PSCs with annotated gene names: {psc_updated_gene}')
142+
log_psc.debug('summary: # PSC with annotated products=%i', psc_updated_product)
143+
print(f'PSCs with annotated gene products: {psc_updated_product}')
144+
print(f'NRPs not found: {nrps_not_found}')
145+
print(f'NRPs w/o IPS: {nrps_wo_ips}')

0 commit comments

Comments
 (0)