Skip to content

Commit

Permalink
Update uniprot API
Browse files (browse the repository at this point in the history)
  • Loading branch information
michellemli committed Sep 23, 2024
1 parent ae10f93 commit 18d1d3e
Showing 1 changed file with 19 additions and 2 deletions.
21 changes: 19 additions & 2 deletions finetune_pinnacle/extract_txdata_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,6 @@
from xml.etree import ElementTree


LEGACY_UNIPROT_API_URL = 'https://legacy.uniprot.org/uploadlists/'
UNIPROT_API_URL = 'https://rest.uniprot.org/idmapping/'
OT_URL = "https://api.platform.opentargets.org/api/v4/graphql"
TOTAL_MAX = 20000
Expand Down Expand Up @@ -211,14 +210,15 @@ def evidence2genename(drug_evidence_data: pd.DataFrame, ensg2otgenename: dict):

data = parse.urlencode(params)
data = data.encode('utf-8')
req = request.Request(LEGACY_UNIPROT_API_URL, data)
req = request.Request(UNIPROT_API_URL, data)
with request.urlopen(req) as f:
response = f.read()
res = response.decode('utf-8')
uniprot2name = {ins.split('\t')[0]:ins.split('\t')[1] for ins in res.split('\n')[1:-1]}

except:
# Adapted from https://www.uniprot.org/help/id_mapping
print("Retrying for Uniprot...")
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
Expand Down Expand Up @@ -379,13 +379,30 @@ def get_id_mapping_results_stream(url):
for o in out:
ensg2name[o['query']] = o['symbol']

print("ensg2otgenename", len(ensg2otgenename))

# Not sure why these ENSG IDs are missing from ensg2otgenename; add their gene names manually if absent
if "ENSG00000187733" not in ensg2otgenename: ensg2otgenename["ENSG00000187733"] = "AMY1C"
if "ENSG00000014138" not in ensg2otgenename: ensg2otgenename["ENSG00000014138"] = "POLA2"
if "ENSG00000062822" not in ensg2otgenename: ensg2otgenename["ENSG00000062822"] = "POLD1"
if "ENSG00000077514" not in ensg2otgenename: ensg2otgenename["ENSG00000077514"] = "POLD3"
if "ENSG00000100479" not in ensg2otgenename: ensg2otgenename["ENSG00000100479"] = "POLE2"
if "ENSG00000101868" not in ensg2otgenename: ensg2otgenename["ENSG00000101868"] = "POLA1"
if "ENSG00000106628" not in ensg2otgenename: ensg2otgenename["ENSG00000106628"] = "POLD2"
if "ENSG00000198056" not in ensg2otgenename: ensg2otgenename["ENSG00000198056"] = "PRIM1"
if "ENSG00000146143" not in ensg2otgenename: ensg2otgenename["ENSG00000146143"] = "PRIM2"
if "ENSG00000148229" not in ensg2otgenename: ensg2otgenename["ENSG00000148229"] = "POLE3"
if "ENSG00000167325" not in ensg2otgenename: ensg2otgenename["ENSG00000167325"] = "RRM1"
if "ENSG00000175482" not in ensg2otgenename: ensg2otgenename["ENSG00000175482"] = "POLD4"
if "ENSG00000177084" not in ensg2otgenename: ensg2otgenename["ENSG00000177084"] = "POLE"
if "ENSG00000142319" not in ensg2otgenename: ensg2otgenename["ENSG00000142319"] = "SLC6A3"

disease_drug_targets = set(uniprot2name.values())
disease_drug_targets.update(ensg2name.values())

# ENSG --> gene name through OT
missing_mappings = [ensg for ensg in drug_evidence_data.targetId if ensg not in ensg2otgenename]
if len(missing_mappings) > 0: print("MISSING MAPPINGS:", missing_mappings)
disease_drug_targets.update([ensg2otgenename[ensg] for ensg in drug_evidence_data.targetId])

print(f'Found {len(disease_drug_targets)} targets with clinically relevant evidence.')
Expand Down

0 comments on commit 18d1d3e

Please sign in to comment.