diff --git a/notebooks/uniprot_prefix_investigation/data/identifier_table_prefixes.txt b/notebooks/uniprot_prefix_investigation/data/identifier_table_prefixes.txt
new file mode 100644
index 0000000..e8684e4
--- /dev/null
+++ b/notebooks/uniprot_prefix_investigation/data/identifier_table_prefixes.txt
@@ -0,0 +1,171 @@
+ABCD
+AGR
+Agora
+Allergome
+AlphaFoldDB
+AntiFam
+Antibodypedia
+ArachnoServer
+Araport
+BMRB
+BRENDA
+Bgee
+BindingDB
+BioCyc
+BioGRID
+BioGRID-ORCS
+BioMuta
+CARD
+CAZy
+CCDS
+CD-CODE
+CDD
+CGD
+CIViC
+CORUM
+CPTAC
+CPTC
+CTD
+CarbonylDB
+ChEMBL
+ChiTaRS
+ClinPGx
+CollecTF
+ComplexPortal
+ConoServer
+DEPOD
+DIP
+DMDM
+DNASU
+DisGeNET
+DisProt
+DrugBank
+DrugCentral
+EC
+ELM
+EMDB
+ESTHER
+EchoBASE
+EnsemblBacteria
+EnsemblFungi
+EnsemblMetazoa
+EnsemblPlants
+EnsemblProtists
+EvolutionaryTrace
+ExpressionAtlas
+FlyBase
+FunCoup
+FunFam
+GO
+Gene3D
+GeneCards
+GeneID
+GeneReviews
+GeneTree
+GeneWiki
+GenomeRNAi
+GlyConnect
+GlyCosmos
+GlyGen
+Gramene
+GuidetoPHARMACOLOGY
+HAMAP
+HGNC
+HOGENOM
+HPA
+IDEAL
+IMGT_GENE-DB
+InParanoid
+IntAct
+InterPro
+JaponicusDB
+KEGG
+LegioList
+Leproma
+MEROPS
+MGI
+MIM
+MINT
+MaizeGDB
+MalaCards
+MassIVE
+MetOSite
+MoonDB
+MoonProt
+NCBITaxon
+NCBIfam
+NIAGADS
+OGP
+OMA
+OpenTargets
+Orphanet
+OrthoDB
+PAN-GO
+PANTHER
+PATRIC
+PCDDB
+PDB
+PDBsum
+PHI-base
+PIR
+PIRSF
+PRIDE
+PRINTS
+PRO
+PROSITE
+PathwayCommons
+PaxDb
+PeptideAtlas
+PeroxiBase
+Pfam
+Pharos
+PhosphoSitePlus
+PhylomeDB
+PlantReactome
+PomBase
+ProMEX
+Proteomes
+ProteomicsDB
+PseudoCAP
+Pumba
+REBASE
+REPRODUCTION-2DPAGE
+RGD
+RNAct
+Reactome
+SABIO-RK
+SASBDB
+SFLD
+SGD
+SIGNOR
+SMART
+SMR
+STRENDA-DB
+STRING
+SUPFAM
+SignaLink
+SwissLipids
+SwissPalm
+TAIR
+TCDB
+TopDownProteomics
+TubercuList
+UCSC
+UniLectin
+UniPathway
+UniProt
+VEuPathDB
+VGNC
+WBParaSite
+WormBase
+Xenbase
+YCharOS
+ZFIN
+dictyBase
+eggNOG
+ensembl
+euHCVdb
+genbank
+iPTMnet
+jPOST
+refseq
\ No newline at end of file
diff --git a/notebooks/uniprot_prefix_investigation/data/prefixes.txt b/notebooks/uniprot_prefix_investigation/data/prefixes.txt
new file mode 100644
index 0000000..1260d90
--- /dev/null
+++ b/notebooks/uniprot_prefix_investigation/data/prefixes.txt
@@ -0,0 +1,103 @@
+Allergome
+ArachnoServer
+Araport
+BioCyc
+BioGRID
+BioMuta
+CCDS
+CGD
+CPTAC
+CRC64
+ChEMBL
+ChiTaRS
+CollecTF
+ComplexPortal
+ConoServer
+DIP
+DMDM
+DNASU
+DisProt
+DrugBank
+EMBL
+EMBL-CDS
+EMDB
+ESTHER
+EchoBASE
+Ensembl
+EnsemblGenome
+EnsemblGenome_PRO
+EnsemblGenome_TRS
+Ensembl_PRO
+Ensembl_TRS
+FlyBase
+GI
+GeneCards
+GeneID
+GeneReviews
+GeneTree
+GeneWiki
+Gene_Name
+Gene_ORFName
+Gene_OrderedLocusName
+Gene_Synonym
+GenomeRNAi
+GlyConnect
+GuidetoPHARMACOLOGY
+HGNC
+HOGENOM
+IDEAL
+JaponicusDB
+KEGG
+LegioList
+Leproma
+MEROPS
+MGI
+MIM
+MINT
+MaizeGDB
+NCBI_TaxID
+OMA
+OpenTargets
+Orphanet
+OrthoDB
+PATRIC
+PDB
+PHI-base
+PeroxiBase
+PharmGKB
+PlantReactome
+PomBase
+ProteomicsDB
+PseudoCAP
+REBASE
+RGD
+Reactome
+RefSeq
+RefSeq_NT
+SGD
+STRING
+SwissLipids
+TAIR
+TCDB
+TreeFam
+TubercuList
+UCSC
+UniParc
+UniPathway
+UniProtKB-ID
+UniRef100
+UniRef50
+UniRef90
+VEuPathDB
+VGNC
+WBParaSite
+WBParaSite_TRS_PRO
+WormBase
+WormBase_PRO
+WormBase_TRS
+Xenbase
+ZFIN
+dictyBase
+eggNOG
+euHCVdb
+neXtProt
diff --git a/notebooks/uniprot_prefix_investigation/data/uniprot_prefix_remapping.json b/notebooks/uniprot_prefix_investigation/data/uniprot_prefix_remapping.json
new file mode 100644
index 0000000..61bd31b
--- /dev/null
+++ b/notebooks/uniprot_prefix_investigation/data/uniprot_prefix_remapping.json
@@ -0,0 +1,661 @@
+[
+ {
+ "__prefix": "Allergome",
+ "_status": "exact",
+ "match": "allergome"
+ },
+ {
+ "__prefix": "ArachnoServer",
+ "_status": "exact",
+ "match": "arachnoserver"
+ },
+ {
+ "__prefix": "Araport",
+ "_status": "exact",
+ "match": "araport"
+ },
+ {
+ "__prefix": "BioCyc",
+ "_status": "exact",
+ "match": "biocyc"
+ },
+ {
+ "__prefix": "BioGRID",
+ "_status": "exact",
+ "match": "biogrid"
+ },
+ {
+ "__prefix": "CCDS",
+ "_status": "exact",
+ "match": "ccds"
+ },
+ {
+ "__prefix": "CGD",
+ "_status": "exact",
+ "match": "cgd"
+ },
+ {
+ "__prefix": "ChEMBL",
+ "_status": "exact",
+ "match": "chembl"
+ },
+ {
+ "__prefix": "ComplexPortal",
+ "_status": "exact",
+ "match": "complexportal"
+ },
+ {
+ "__prefix": "ConoServer",
+ "_status": "exact",
+ "match": "conoserver"
+ },
+ {
+ "__prefix": "CRC64",
+ "_status": "UniProt_entry",
+ "comment": "Information from UniProt entry",
+ "match": "CRC64"
+ },
+ {
+ "__prefix": "dictyBase",
+ "_status": "exact",
+ "match": "dictybase"
+ },
+ {
+ "__prefix": "DIP",
+ "_status": "exact",
+ "match": "dip"
+ },
+ {
+ "__prefix": "DisProt",
+ "_status": "exact",
+ "match": "disprot"
+ },
+ {
+ "__prefix": "DrugBank",
+ "_status": "exact",
+ "match": "drugbank"
+ },
+ {
+ "__prefix": "EchoBASE",
+ "_status": "exact",
+ "match": "echobase"
+ },
+ {
+ "__prefix": "eggNOG",
+ "_status": "exact",
+ "match": "eggnog"
+ },
+ {
+ "__prefix": "EMDB",
+ "_status": "exact",
+ "match": "emdb"
+ },
+ {
+ "__prefix": "Ensembl",
+ "_status": "exact",
+ "match": "ensembl"
+ },
+ {
+ "__prefix": "FlyBase",
+ "_status": "exact",
+ "match": "FlyBase"
+ },
+ {
+ "__prefix": "Gene_Name",
+ "_status": "UniProt_entry",
+ "comment": "Information from UniProt entry",
+ "match": "Gene_Name"
+ },
+ {
+ "__prefix": "Gene_OrderedLocusName",
+ "_status": "UniProt_entry",
+ "comment": "Information from UniProt entry",
+ "match": "Gene_OrderedLocusName"
+ },
+ {
+ "__prefix": "Gene_ORFName",
+ "_status": "UniProt_entry",
+ "comment": "Information from UniProt entry",
+ "match": "Gene_ORFName"
+ },
+ {
+ "__prefix": "Gene_Synonym",
+ "_status": "UniProt_entry",
+ "comment": "Information from UniProt entry",
+ "match": "Gene_Synonym"
+ },
+ {
+ "__prefix": "GeneCards",
+ "_status": "synonym",
+ "matches": [
+ "genecards.gene"
+ ]
+ },
+ {
+ "__prefix": "GeneID",
+ "_status": "synonym",
+ "matches": [
+ "NCBIGene"
+ ]
+ },
+ {
+ "__prefix": "GeneTree",
+ "_status": "exact",
+ "match": "genetree"
+ },
+ {
+ "__prefix": "GeneWiki",
+ "_status": "exact",
+ "match": "genewiki"
+ },
+ {
+ "__prefix": "GI",
+ "_status": "map",
+ "matches": [
+ "ncbigi"
+ ]
+ },
+ {
+ "__prefix": "HGNC",
+ "_status": "exact",
+ "match": "hgnc"
+ },
+ {
+ "__prefix": "HOGENOM",
+ "_status": "exact",
+ "match": "hogenom"
+ },
+ {
+ "__prefix": "IDEAL",
+ "_status": "exact",
+ "match": "ideal"
+ },
+ {
+ "__prefix": "KEGG",
+ "_status": "exact",
+ "match": "kegg"
+ },
+ {
+ "__prefix": "MaizeGDB",
+ "_status": "synonym",
+ "matches": [
+ "maizegdb.locus"
+ ]
+ },
+ {
+ "__prefix": "MEROPS",
+ "_status": "map",
+ "matches": [
+ "merops.entry"
+ ]
+ },
+ {
+ "__prefix": "MGI",
+ "_status": "exact",
+ "match": "MGI"
+ },
+ {
+ "__prefix": "MIM",
+ "_status": "synonym",
+ "matches": [
+ "omim"
+ ]
+ },
+ {
+ "__prefix": "MINT",
+ "_status": "exact",
+ "match": "mint"
+ },
+ {
+ "__prefix": "NCBI_TaxID",
+ "_status": "synonym",
+ "matches": [
+ "NCBITaxon"
+ ]
+ },
+ {
+ "__prefix": "neXtProt",
+ "_status": "exact",
+ "match": "nextprot"
+ },
+ {
+ "__prefix": "Orphanet",
+ "_status": "synonym",
+ "matches": [
+ "ORPHA"
+ ]
+ },
+ {
+ "__prefix": "OrthoDB",
+ "_status": "exact",
+ "match": "orthodb"
+ },
+ {
+ "__prefix": "PDB",
+ "_status": "exact",
+ "match": "pdb"
+ },
+ {
+ "__prefix": "PeroxiBase",
+ "_status": "exact",
+ "match": "peroxibase"
+ },
+ {
+ "__prefix": "PharmGKB",
+ "_status": "map",
+ "matches": [
+ "pharmgkb.gene"
+ ]
+ },
+ {
+ "__prefix": "PomBase",
+ "_status": "exact",
+ "match": "pombase"
+ },
+ {
+ "__prefix": "Reactome",
+ "_status": "exact",
+ "match": "reactome"
+ },
+ {
+ "__prefix": "REBASE",
+ "_status": "exact",
+ "match": "rebase"
+ },
+ {
+ "__prefix": "RefSeq",
+ "_status": "exact",
+ "match": "refseq"
+ },
+ {
+ "__prefix": "RefSeq_NT",
+ "_status": "exact",
+ "match": "nucleotide"
+ },
+ {
+ "__prefix": "RGD",
+ "_status": "exact",
+ "match": "rgd"
+ },
+ {
+ "__prefix": "SGD",
+ "_status": "exact",
+ "match": "sgd"
+ },
+ {
+ "__prefix": "STRING",
+ "_status": "exact",
+ "match": "string"
+ },
+ {
+ "__prefix": "SwissLipids",
+ "_status": "synonym",
+ "matches": [
+ "SLM"
+ ]
+ },
+ {
+ "__prefix": "TAIR",
+ "_status": "map",
+ "matches": [
+ "tair.locus"
+ ]
+ },
+ {
+ "__prefix": "TCDB",
+ "_status": "exact",
+ "match": "tcdb"
+ },
+ {
+ "__prefix": "TreeFam",
+ "_status": "exact",
+ "match": "treefam"
+ },
+ {
+ "__prefix": "TubercuList",
+ "_status": "synonym",
+ "matches": [
+ "myco.tuber"
+ ]
+ },
+ {
+ "__prefix": "UCSC",
+ "_status": "exact",
+ "match": "ucsc"
+ },
+ {
+ "__prefix": "UniParc",
+ "_status": "exact",
+ "match": "uniparc"
+ },
+ {
+ "__prefix": "UniPathway",
+ "_status": "synonym",
+ "matches": [
+ "UPA"
+ ]
+ },
+ {
+ "__prefix": "UniProtKB-ID",
+ "_status": "exact",
+ "match": "uniprot"
+ },
+ {
+ "__prefix": "UniRef100",
+ "_status": "exact",
+ "match": "uniref"
+ },
+ {
+ "__prefix": "UniRef50",
+ "_status": "exact",
+ "match": "uniref"
+ },
+ {
+ "__prefix": "UniRef90",
+ "_status": "exact",
+ "match": "uniref"
+ },
+ {
+ "__prefix": "VGNC",
+ "_status": "exact",
+ "match": "vgnc"
+ },
+ {
+ "__prefix": "WormBase",
+ "_status": "exact",
+ "match": "WormBase"
+ },
+ {
+ "__prefix": "Xenbase",
+ "_status": "exact",
+ "match": "xenbase"
+ },
+ {
+ "__prefix": "ZFIN",
+ "_status": "exact",
+ "match": "zfin"
+ },
+ {
+ "__prefix": "BioMuta",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "ChiTaRS",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "CollecTF",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "CPTAC",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "DMDM",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "DNASU",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "EMBL",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "ESTHER",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "euHCVdb",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "GeneReviews",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "GenomeRNAi",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "GlyConnect",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "GuidetoPHARMACOLOGY",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "JaponicusDB",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "LegioList",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "Leproma",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "OMA",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "OpenTargets",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "PATRIC",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "PHI-base",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "PlantReactome",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "ProteomicsDB",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "PseudoCAP",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "VEuPathDB",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry",
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "WBParaSite",
+ "_status": "UniProt_dblist",
+ "comment": [
+ "See UniProt dblist entry"
+ ]
+ },
+ {
+ "__prefix": "CRC64",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "EMBL-CDS",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "Ensembl_PRO",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "Ensembl_TRS",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "EnsemblGenome",
+ "_status": null,
+ "comment": [
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "EnsemblGenome_PRO",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "EnsemblGenome_TRS",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "Gene_Name",
+ "_status": null,
+ "comment": [
+ "Prefix found in Bioregistry file contents"
+ ]
+ },
+ {
+ "__prefix": "Gene_OrderedLocusName",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "Gene_ORFName",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "Gene_Synonym",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "WBParaSite_TRS_PRO",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "WormBase_PRO",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ },
+ {
+ "__prefix": "WormBase_TRS",
+ "_status": null,
+ "comment": [
+ "No information"
+ ]
+ }
+]
\ No newline at end of file
diff --git a/notebooks/uniprot_prefix_investigation/prefix_remapper_investigation.ipynb b/notebooks/uniprot_prefix_investigation/prefix_remapper_investigation.ipynb
new file mode 100644
index 0000000..b7643ee
--- /dev/null
+++ b/notebooks/uniprot_prefix_investigation/prefix_remapper_investigation.ipynb
@@ -0,0 +1,1604 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9233cbb4-85c2-4cf1-8eae-580830572938",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import col, lower, udf\n",
+ "from pyspark.sql.types import StructType, StructField, StringType, BooleanType\n",
+ "from pathlib import Path\n",
+ "from collections import Counter, defaultdict\n",
+ "import json\n",
+ "import gzip\n",
+ "import bioregistry as br\n",
+ "\n",
+ "from berdl_notebook_utils.setup_spark_session import get_spark_session\n",
+ "\n",
+ "spark = get_spark_session(local=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "907178fb-271a-41df-8fa2-21f9078b23c1",
+ "metadata": {},
+ "source": [
+ "Load BioRegistry and Remapping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8a5e3ce4-557c-4528-be58-0c4934cd3782",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "registry_set = set()\n",
+ "\n",
+ "for r in br.resources():\n",
+ " registry_set.add(r.prefix.lower())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "6741d766-57a0-425c-932a-3162f3bcc3cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Registry entries:2569\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Registry entries:{len(registry_set)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e0ce1c66-0eb5-4a8b-94c3-8c09843dfb1b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The remapping table is committed in the data/ directory next to this notebook.\n",
+ "MAPPING_PATH = Path(\"data/uniprot_prefix_remapping.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "27e6c78b-17ab-40e3-8025-3032b43f3dbd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_mapping(path: Path) -> list:\n",
+ "    \"\"\"Load the prefix remapping file.\n",
+ "\n",
+ "    Args:\n",
+ "        path: Location of the JSON remapping file.\n",
+ "\n",
+ "    Returns:\n",
+ "        The parsed JSON content, exactly as produced by ``json.load``.\n",
+ "    \"\"\"\n",
+ "    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.\n",
+ "    with open(path, encoding=\"utf-8\") as f:\n",
+ "        return json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1bde584c-1bc6-4121-8238-79beed1dec5e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapping = load_mapping(MAPPING_PATH)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "193caba9-2857-4a52-a721-446e6d2ada35",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Remapping entries: 108\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Remapping entries: {len(mapping)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "637b3c5e-23c0-4477-b0ae-16bc2d3ea96f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# REGISTRY_PATH = Path(\"registry.json\")\n",
+ "\n",
+ "# def load_registry(path: Path) -> dict:\n",
+ "# with open(path) as f:\n",
+ "# return json.load(f)\n",
+ "\n",
+ "# registry = load_registry(REGISTRY_PATH)\n",
+ "# print(f\"Registry entries: {len(registry)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e7a4577a-0f85-4161-bad0-48379a992d7d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "All keys are present in mapping file:\n",
+ "{'match', 'comment', '_status', 'matches', '__prefix'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Inspect remapping file structure\n",
+ "\n",
+ "all_keys = set()\n",
+ "\n",
+ "for row in mapping:\n",
+ " all_keys.update(row.keys())\n",
+ "\n",
+ "print(\"All keys are present in mapping file:\")\n",
+ "print(all_keys)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "795f9af0-e3e9-428e-a79c-a85df4078d91",
+ "metadata": {},
+ "source": [
+ "\n",
+ "- `__prefix` – the original prefix\n",
+ "- `_status` – classification\n",
+ "- `match` / `matches` – canonical BioRegistry targets\n",
+ "- `comment` – explanatory notes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "12030966-1aa2-459d-8845-daf532f2077a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Remapping status categories:\n",
+ "None: 14\n",
+ "UniProt_dblist: 25\n",
+ "UniProt_entry: 5\n",
+ "exact: 51\n",
+ "map: 4\n",
+ "synonym: 9\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Remapping status distribution\n",
+ "\n",
+ "status_counts = Counter(row.get(\"_status\") for row in mapping)\n",
+ "\n",
+ "print(\"Remapping status categories:\")\n",
+ "for status, count in sorted(status_counts.items(), key=lambda x: str(x[0])):\n",
+ " print(f\"{status}: {count}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "a8105639-bace-4a74-a188-442d77d8eefb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Canonical target validation\n",
+ "\n",
+ "\n",
+ "def standardize_namespace_identifiers(mapping: list) -> set:\n",
+ "    \"\"\"Collect every target named in the remapping entries, normalized.\n",
+ "\n",
+ "    Each entry may carry a single `match` string and/or a `matches` list;\n",
+ "    every value is stripped of surrounding whitespace and lower-cased.\n",
+ "\n",
+ "    Args:\n",
+ "        mapping: Remapping entries (dicts) as loaded from the JSON file.\n",
+ "\n",
+ "    Returns:\n",
+ "        Set of normalized target names; entries without targets add nothing.\n",
+ "    \"\"\"\n",
+ "    targets = set()\n",
+ "\n",
+ "    for entry in mapping:\n",
+ "        candidates = []\n",
+ "        single = entry.get(\"match\")\n",
+ "        if single:\n",
+ "            candidates.append(single)\n",
+ "        # A missing, null or empty `matches` value contributes no candidates.\n",
+ "        candidates.extend(entry.get(\"matches\") or [])\n",
+ "        targets.update(name.strip().lower() for name in candidates)\n",
+ "\n",
+ "    return targets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "62e65373-e498-4232-aae6-de20adde060b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "standardize_namespaces = standardize_namespace_identifiers(mapping)\n",
+ "invalid_targets = standardize_namespaces - registry_set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "33a468ea-dc35-4738-a14e-28ecbc2b3253",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "namespace identifiers missing in BioRegistry:\n",
+ "['crc64', 'gene_name', 'gene_orderedlocusname', 'gene_orfname', 'gene_synonym']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"namespace identifiers missing in BioRegistry:\")\n",
+ "print(sorted(invalid_targets))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a5c26222-2060-4fd9-8eab-bbe04e110efa",
+ "metadata": {},
+ "source": [
+ "Some canonical targets referenced in the remapping file are not present\n",
+ "in the BioRegistry. These are potential governance gaps and are examined below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0c5b5346-8397-4e2e-b17f-6bf54fc2ea5d",
+ "metadata": {},
+ "source": [
+ "### Interpret the results\n",
+ "\n",
+ "These identifiers are not external database namespaces but rather annotation fields from UniProt records, such as gene name metadata or checksum fields. Therefore, they are expected to be absent from BioRegistry and do not represent true external identifiers.\n",
+ "\n",
+ "### Manual investigation of additional prefixes\n",
+ "\n",
+ "Beyond the automatically detected differences, we also manually reviewed other prefixes referenced in upstream datasets and mapping sources. Some of these represent legitimate biological databases that are not yet registered in BioRegistry.\n",
+ "\n",
+ "These prefixes are tracked separately as known governance gaps:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "75c436c8-7d91-49aa-ac61-04a8a453041f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NOT_FOUND_PREFIXES = {\n",
+ " \"agr\",\n",
+ " \"alphafolddb\",\n",
+ " \"antibodypedia\",\n",
+ " \"bgee\",\n",
+ " \"biogrid-orcs\",\n",
+ " \"ctd\",\n",
+ " \"dnasu\",\n",
+ " \"esther\",\n",
+ " \"funfam\",\n",
+ " \"gene3d\",\n",
+ " \"gramene\",\n",
+ " \"ncbifam\",\n",
+ " \"patric\",\n",
+ " \"sfld\",\n",
+ " \"veupathdb\",\n",
+ " \"wbparasite\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "822e335c-cc02-4797-9cb6-93024bd33e35",
+ "metadata": {},
+ "source": [
+ "These prefixes require follow-up actions, such as:\n",
+ "\n",
+ "- registering them in BioRegistry\n",
+ "- defining canonical namespace mappings\n",
+ "- or documenting them as dataset-specific identifiers.\n",
+ "\n",
+ "### Summary\n",
+ "\n",
+ "The investigation confirmed that:\n",
+ "\n",
+ "- Most canonical targets in the remapping file align with BioRegistry namespaces.\n",
+ "- A small number of entries correspond to annotation fields rather than databases.\n",
+ "- A separate group of prefixes represents external resources not yet included in BioRegistry, which are tracked for governance follow-up."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "74014929-f583-4334-b9b9-26e93cbe46c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapping_dict = {row[\"__prefix\"].lower(): row for row in mapping if isinstance(row.get(\"__prefix\"), str)}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "027ea84e-5e70-4342-a254-4dd2e39310a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "agr is not found in remapping file\n",
+ "alphafolddb is not found in remapping file\n",
+ "antibodypedia is not found in remapping file\n",
+ "bgee is not found in remapping file\n",
+ "biogrid-orcs is not found in remapping file\n",
+ "ctd is not found in remapping file\n",
+ "dnasu | status: UniProt_dblist\n",
+ "esther | status: UniProt_dblist\n",
+ "funfam is not found in remapping file\n",
+ "gene3d is not found in remapping file\n",
+ "gramene is not found in remapping file\n",
+ "ncbifam is not found in remapping file\n",
+ "patric | status: UniProt_dblist\n",
+ "sfld is not found in remapping file\n",
+ "veupathdb | status: UniProt_dblist\n",
+ "wbparasite | status: UniProt_dblist\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Check missing prefixes in remapping\n",
+ "\n",
+ "for p in sorted(NOT_FOUND_PREFIXES):\n",
+ " row = mapping_dict.get(p.lower())\n",
+ " if not row:\n",
+ " print(f\"{p} is not found in remapping file\")\n",
+ " else:\n",
+ " print(f\"{p:<20} | status: {row.get('_status')}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "069ca961-73bc-400b-9a44-a30cfe33cbba",
+ "metadata": {},
+ "source": [
+ "Summary:\n",
+ "\n",
+ "- Some prefixes (agr, alphafolddb, antibodypedia, bgee, biogrid-orcs, ctd, funfam, gene3d, gramene, ncbifam, sfld) are not in the remapping file.\n",
+ "- Other prefixes are marked as `UniProt_dblist` (annotation-level references).\n",
+ "- Some are synonyms or require subtype mapping.\n",
+ "\n",
+ "This confirms that a normalization layer is necessary."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94d3591e-2723-4663-a92b-db8c1db221b8",
+ "metadata": {},
+ "source": [
+ "## Key Findings\n",
+ "\n",
+ "1. UniProt links to multiple external databases that do not use canonical BioRegistry prefixes.\n",
+ "2. Some namespaces collapse subtypes (e.g., PANTHER → panther.family, panther.node, panther.pathway, panther.pthcmp).\n",
+ "3. Several databases linked from UniProt are not present in BioRegistry.\n",
+ "4. Some prefixes represent annotation sources rather than identifier namespaces.\n",
+ "5. A normalization transformer is required to ensure namespace governance.\n",
+ "\n",
+ "We implemented a Spark-based prefix normalization transformer that:\n",
+ "\n",
+ "- Enforces canonical BioRegistry prefixes\n",
+ "- Applies subtype mappings where required\n",
+ "- Detects and flags registry gaps\n",
+ "- Fails fast on unclassified prefixes\n",
+ "\n",
+ "Output dataset fields:\n",
+ "- `db_normalized`\n",
+ "- `prefix_category`\n",
+ "- `is_registry_gap`\n",
+ "\n",
+ "This ensures downstream ingestion pipelines operate on\n",
+ "standardized and governance-ready prefixes.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4adee2bd-81ba-48d2-9d72-ea6a79760f68",
+ "metadata": {},
+ "source": [
+ "# UniProt Prefix Governance Investigation\n",
+ "\n",
+ "Investigates:\n",
+ "1. The namespace universe present in UniProt cross-references\n",
+ "2. The namespace universe present in UniProt idmapping.dat\n",
+ "3. Overlap and differences\n",
+ "4. Coverage against Bioregistry\n",
+ "5. Proposed strategy for implementing a prefix remapper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "79716123-8f73-4dea-968b-c6984f3dad61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "PARQUET_SOURCE = \"s3a://cdm-lake/tenant-general-warehouse/kbase/datasets/uniprot/uniprot_kb/identifier\"\n",
+ "\n",
+ "df = spark.read.parquet(PARQUET_SOURCE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "29066c00-b6a1-46ad-a2cd-6661583ecb4e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------------------+-------+--------+-----------+------------------+--------------+------------+\n",
+ "| entity_id| db| xref|description| _dlt_load_id| _dlt_id|relationship|\n",
+ "+------------------+-------+--------+-----------+------------------+--------------+------------+\n",
+ "|uniprot:A0A068QWH9| PRINTS| PR00368| NULL|1770728436.7741342|drstc13RmvdHag| NULL|\n",
+ "|uniprot:A0A068QWH9| PRINTS| PR00411| NULL|1770728436.7741342|MPVeMCDjxAJ89Q| NULL|\n",
+ "|uniprot:A0A068QWH9| SUPFAM|SSF51905| NULL|1770728436.7741342|VREQxAb6fbK+BQ| NULL|\n",
+ "|uniprot:A0A068QWH9| SUPFAM|SSF55424| NULL|1770728436.7741342|ekRrV/FUJ73c2Q| NULL|\n",
+ "|uniprot:A0A068QWH9|PROSITE| PS00076| NULL|1770728436.7741342|kuBN643V/sWyng| NULL|\n",
+ "+------------------+-------+--------+-----------+------------------+--------------+------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.limit(5).show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "03bb0997-6fa1-4dad-a95b-4d4b99ca6c5e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Deduplicate over the whole table. A row limit placed before distinct() would only\n",
+ "# surface the namespaces that occur in the first rows and silently hide the rest.\n",
+ "prefix_df = df.select(lower(col(\"db\")).alias(\"db\")).where(col(\"db\").isNotNull()).distinct()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "e63e2376-3c1d-49c8-aac9-679089a7f308",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------+\n",
+ "| db|\n",
+ "+---------+\n",
+ "| panther|\n",
+ "| pfam|\n",
+ "| supfam|\n",
+ "|ncbitaxon|\n",
+ "| uniprot|\n",
+ "+---------+\n",
+ "only showing top 5 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "prefix_df.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "0642a9d0-2350-4eb3-94cc-6fd4927892e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "32"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "parquet_set = {row.db for row in prefix_df.collect()}\n",
+ "len(parquet_set)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f8d099f-687c-4d2b-a886-db6622156234",
+ "metadata": {},
+ "source": [
+ "\n",
+ "In the recorded run, the query above returned **32 unique database prefixes** from the Parquet dataset.\n",
+ "These are the namespaces seen in the UniProt cross-reference rows scanned by that query, so the count is only as complete as the scan."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "2da657fd-af66-4200-877d-04ba4241e053",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "4eb978fa-1f41-451c-94f3-e2e27f4c7258",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "103"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# prefixes.txt lists one idmapping prefix per line; it is committed in data/ next to this notebook.\n",
+ "idmapping_path = Path(\"data/prefixes.txt\")\n",
+ "\n",
+ "with open(idmapping_path, \"rt\", encoding=\"utf-8\") as f:\n",
+ "    # Lower-case for case-insensitive comparison with the Parquet prefixes; skip blank lines.\n",
+ "    idmapping_set = {line.strip().lower() for line in f if line.strip()}\n",
+ "\n",
+ "len(idmapping_set)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98745afe-9493-4ec8-a6b2-2dc079d99c9f",
+ "metadata": {},
+ "source": [
+ "The idmapping file contains **103 unique ID_type prefixes**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "0ac87cdb-5e48-4b7e-9f8d-be7f3bf062bf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(10, 22, 93)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "shared = parquet_set & idmapping_set\n",
+ "only_parquet = parquet_set - idmapping_set\n",
+ "only_idmapping = idmapping_set - parquet_set\n",
+ "\n",
+ "len(shared), len(only_parquet), len(only_idmapping)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2b7761f3-e795-496e-9890-35aa1a3d771a",
+ "metadata": {},
+ "source": [
+ "\n",
+ "| Category | Count |\n",
+ "|----------|-------|\n",
+ "| Shared | 10 |\n",
+ "| Only in Parquet | 22 |\n",
+ "| Only in idmapping | 93 |\n",
+ "\n",
+ "\n",
+ "In the recorded run, most of the namespaces found in Parquet (22 of 32) do not appear in idmapping. Therefore, idmapping.dat is NOT a complete namespace authority."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "260e86df-b48c-40f0-a638-511348f0ab2e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Valid in registry: 21\n",
+ "Missing in registry: 11\n",
+ "Missing sample: ['alphafolddb', 'funfam', 'gene3d', 'geneid', 'ncbifam', 'panther', 'patric', 'proteomes', 'smr', 'unipathway', 'veupathdb']\n"
+ ]
+ }
+ ],
+ "source": [
+ "valid = parquet_set & registry_set\n",
+ "missing = parquet_set - registry_set\n",
+ "\n",
+ "print(\"Valid in registry:\", len(valid))\n",
+ "print(\"Missing in registry:\", len(missing))\n",
+ "print(\"Missing sample:\", sorted(list(missing))[:20])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24f38c8d-7d71-4f8a-aed1-dcff42b669ab",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "| Category | Count |\n",
+ "|----------|-------|\n",
+ "| Valid | 21 |\n",
+ "| Not Found | 11 |\n",
+ "\n",
+ "In the recorded run, roughly a third of the namespaces used by UniProt (11 of 32) are not present in Bioregistry."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "92ee97b4-dd09-4be9-8cf7-71e439054261",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Registry size: 2569\n",
+ "panther in registry_set: False\n",
+ "panther-related: ['panther.pthcmp', 'panther.pathway', 'panther.family', 'panther.node']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Registry size:\", len(registry_set))\n",
+ "print(\"panther in registry_set:\", \"panther\" in registry_set)\n",
+ "print(\"panther-related:\", [x for x in registry_set if \"panther\" in x][:20])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b4b9d5f8-6fab-4e2a-94e4-38d1477862ba",
+ "metadata": {},
+ "source": [
+ "The missing prefixes fall into multiple categories:\n",
+ "\n",
+ "1. Subtype namespaces (e.g., ensemblplants, ensemblbacteria)\n",
+ "2. Annotation sources (e.g., expressionatlas)\n",
+ "3. UniProt dblist-only databases\n",
+ "4. Databases not yet registered in Bioregistry"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e8b7bd8-f2f9-4673-94ea-d0ca92635f6e",
+ "metadata": {},
+ "source": [
+ "Conclusion:\n",
+ "\n",
+ "1. idmapping.dat does not represent the complete identifier namespace universe.\n",
+ "2. UniProt cross-references contain many additional databases.\n",
+ "3. Not all database names represent true identifier namespaces.\n",
+ "4. Bioregistry does not fully cover UniProt dblist databases.\n",
+ "\n",
+ "A prefix remapper must:\n",
+ "- Normalize synonyms\n",
+ "- Collapse subtype namespaces\n",
+ "- Distinguish annotation sources from identifier namespaces\n",
+ "- Explicitly track registry gaps\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba04228d-2f78-40a9-ab81-545d98174aa2",
+ "metadata": {},
+ "source": [
+ "## Prefix normalization "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d2958a6-c817-4955-be8d-2fb6e8831fe1",
+ "metadata": {},
+ "source": [
+ "Some database names used in UniProt are aliases of canonical Bioregistry prefixes.\n",
+ "\n",
+ "These aliases are determined by comparing UniProt database names with the prefixes registered in Bioregistry.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "af1f9ec6-4ba6-48f9-9b29-c576d6d4cc48",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SYNONYM_MAP = {\n",
+ " \"geneid\": \"ncbigene\",\n",
+ " \"unipathway\": \"upa\",\n",
+ " \"ctd\": \"ctd.gene\",\n",
+ " \"gramene\": \"gramene.gene\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "345854ed-8a0d-4b8e-8d8c-d0dcad417e3e",
+ "metadata": {},
+ "source": [
+ "Certain databases in BioRegistry are represented by subtype namespaces rather than a single flat prefix.\n",
+ "\n",
+ "To align with BioRegistry, such prefixes are mapped to a default subtype namespace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "76a168f4-a8dd-4007-9af1-d7854679d7db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "MAP_NAMESPACE = {\n",
+ " \"merops\": \"merops.entry\",\n",
+ " \"ensemblbacteria\": \"ensembl\",\n",
+ " \"ensemblmetazoa\": \"ensembl\",\n",
+ " \"ensemblplants\": \"ensembl\",\n",
+ " \"panther\": \"panther.family\",\n",
+ " \"pro\": \"pr\",\n",
+ " \"oma\": \"oma.protein\",\n",
+ " \"paxdb\": \"paxdb.protein\",\n",
+ " \"pir\": \"pirsf\",\n",
+ " \"peptideatlas\": \"peptideatlas.peptide\",\n",
+ " \"proteomicsdb\": \"proteomicsdb.protein\",\n",
+ " \"proteomes\": \"uniprot.proteome\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a4dd90b7-0b68-4959-87a4-070ed249d5cc",
+ "metadata": {},
+ "source": [
+ "Some db values represent external annotation providers rather than identifier namespaces.\n",
+ "\n",
+ "Indicators include:\n",
+ "\n",
+ "- The xref value equals the UniProt accession\n",
+ "- The database does not introduce an independent identifier system\n",
+ "- The database primarily provides metadata or annotations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "2c51ff71-712b-44c5-95dc-a95ba730ab50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ANNOTATION_SOURCE = {\n",
+ " \"expressionatlas\",\n",
+ " \"funcoup\",\n",
+ " \"glycosmos\",\n",
+ " \"glygen\",\n",
+ " \"inparanoid\",\n",
+ " \"iptmnet\",\n",
+ " \"metosite\",\n",
+ " \"phosphositeplus\",\n",
+ " \"smr\",\n",
+ " \"swisspalm\",\n",
+ " \"topdownproteomics\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4de68d60-2639-4960-a1c6-197fcf64afe4",
+ "metadata": {},
+ "source": [
+ "Certain prefixes represent internal metadata fields within UniProt records rather than external databases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "27d68e7a-955a-4bfe-8e36-1707bc20aca0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "INTERNAL_METADATA = {\n",
+ " \"gene_name\",\n",
+ " \"gene_orfname\",\n",
+ " \"gene_orderedlocusname\",\n",
+ " \"crc64\",\n",
+ " \"uniprotkb-id\",\n",
+ " \"ensemblgenome_pro\",\n",
+ " \"ensemblgenome_trs\",\n",
+ " \"ensemblgenome\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f28b6611-10ad-41e8-addc-40d0fa49c813",
+ "metadata": {},
+ "source": [
+ "Some observed prefixes correspond to real biological databases but are not currently registered in BioRegistry.\n",
+ "\n",
+ "These prefixes require governance follow-up, such as:\n",
+ "\n",
+ "- registering them in BioRegistry\n",
+ "- defining canonical namespace mappings\n",
+ "- documenting them as dataset-specific identifiers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "2a61f49f-33f1-4e1d-ab83-62c175e3140f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REGISTRY_GAP = {\n",
+ " \"collectf\",\n",
+ " \"alphafolddb\",\n",
+ " \"agr\",\n",
+ " \"antibodypedia\",\n",
+ " \"bgee\",\n",
+ " \"biogrid-orcs\",\n",
+ " \"dnasu\",\n",
+ " \"esther\",\n",
+ " \"funfam\",\n",
+ " \"gene3d\",\n",
+ " \"ncbifam\",\n",
+ " \"patric\",\n",
+ " \"sfld\",\n",
+ " \"veupathdb\",\n",
+ " \"wbparasite\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6cff6ae3-54b0-4f84-8072-14296ef68c4a",
+ "metadata": {},
+ "source": [
+ "The classification is implemented through a rule-based normalization function:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "e2c0393c-1ebb-49a9-9a5d-fe51b017d5f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def normalize_prefix(db: str | None, registry_set: set[str]) -> dict:\n",
+ "    \"\"\"Classify a raw UniProt `db` value and map it to a normalized prefix.\n",
+ "\n",
+ "    The value is stripped and lower-cased, then rules are tried in this order:\n",
+ "    null/blank, INTERNAL_METADATA, ANNOTATION_SOURCE, SYNONYM_MAP, MAP_NAMESPACE,\n",
+ "    exact membership in `registry_set`. Anything left over is labelled\n",
+ "    `registry_gap`, whether or not it is listed in REGISTRY_GAP.\n",
+ "\n",
+ "    Args:\n",
+ "        db: Raw database name, or None.\n",
+ "        registry_set: Known registry prefixes, compared against the lower-cased key.\n",
+ "\n",
+ "    Returns:\n",
+ "        Dict with keys `normalized` (str or None), `category` (str) and\n",
+ "        `is_registry_gap` (bool).\n",
+ "    \"\"\"\n",
+ "    key = db.strip().lower() if db is not None else \"\"\n",
+ "    if not key:\n",
+ "        return {\"normalized\": None, \"category\": \"null\", \"is_registry_gap\": False}\n",
+ "\n",
+ "    if key in INTERNAL_METADATA:\n",
+ "        return {\"normalized\": None, \"category\": \"internal\", \"is_registry_gap\": False}\n",
+ "\n",
+ "    if key in ANNOTATION_SOURCE:\n",
+ "        return {\"normalized\": key, \"category\": \"annotation\", \"is_registry_gap\": False}\n",
+ "\n",
+ "    # Synonyms and subtype mappings differ only in their category label; the target\n",
+ "    # prefix is flagged as a gap when it is itself absent from the registry.\n",
+ "    for category, mapping in ((\"synonym\", SYNONYM_MAP), (\"map\", MAP_NAMESPACE)):\n",
+ "        if key in mapping:\n",
+ "            normalized = mapping[key]\n",
+ "            return {\"normalized\": normalized, \"category\": category, \"is_registry_gap\": normalized not in registry_set}\n",
+ "\n",
+ "    if key in registry_set:\n",
+ "        return {\"normalized\": key, \"category\": \"exact\", \"is_registry_gap\": False}\n",
+ "\n",
+ "    # Fall-through: curated REGISTRY_GAP entries and prefixes that matched no rule are\n",
+ "    # reported identically, so `registry_gap` also contains not-yet-reviewed prefixes.\n",
+ "    return {\"normalized\": key, \"category\": \"registry_gap\", \"is_registry_gap\": True}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "63a2aeca-23d1-4c6d-bc8b-6de53af1245b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Classification preview\n",
+ "\n",
+ "results = []\n",
+ "category_buckets = defaultdict(list)\n",
+ "\n",
+ "for db_name in sorted(parquet_set):\n",
+ " r = normalize_prefix(db_name, registry_set)\n",
+ " results.append((db_name, r[\"category\"], r[\"normalized\"], r[\"is_registry_gap\"]))\n",
+ " category_buckets[r[\"category\"]].append(db_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "7b255ee5-aad6-4fa3-933e-3ef6729f7e4e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Category Summary: \n",
+ "registry_gap : 6\n",
+ "map : 2\n",
+ "synonym : 2\n",
+ "exact : 21\n",
+ "annotation : 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Category Summary: \")\n",
+ "for k in [\"registry_gap\", \"map\", \"synonym\", \"exact\", \"annotation\", \"internal\", \"null\"]:\n",
+ " if k in category_buckets:\n",
+ " print(f\"{k:12} : {len(category_buckets[k])}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ccfff193-2e29-4e96-b8c1-8e1f79f87901",
+ "metadata": {},
+ "source": [
+ "## Sample Inspection\n",
+ "\n",
+ "To validate the correctness of the namespace classification rules, we inspected representative records for selected prefixes in the dataset. \n",
+ "\n",
+ "\n",
+ "This step helps confirm the semantic meaning of each namespace by examining the structure of the cross-reference identifier (`xref`) and its relationship to the UniProt accession.\n",
+ "\n",
+ "The inspection was performed using the following helper function:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "74c723e0-19de-4ba8-870e-a69f3accc552",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------------------+---------------+--------+-----------+\n",
+ "|entity_id |db |xref |description|\n",
+ "+------------------+---------------+--------+-----------+\n",
+ "|uniprot:A0A2Z2PK47|EnsemblBacteria|AAK90952|NULL |\n",
+ "|uniprot:A0A2Z2PIL3|EnsemblBacteria|AAK91061|NULL |\n",
+ "|uniprot:A0A2Z2PR14|EnsemblBacteria|AAK90982|NULL |\n",
+ "|uniprot:A0A2Z2PIH7|EnsemblBacteria|AAK91015|NULL |\n",
+ "|uniprot:O68019 |EnsemblBacteria|AAL46373|NULL |\n",
+ "|uniprot:Q7D2H4 |EnsemblBacteria|AAK91071|NULL |\n",
+ "|uniprot:Q6YRT9 |EnsemblBacteria|BAD02063|NULL |\n",
+ "|uniprot:Q6YRT9 |EnsemblBacteria|BAD02122|NULL |\n",
+ "|uniprot:Q6YRT8 |EnsemblBacteria|BAD02064|NULL |\n",
+ "|uniprot:Q6YRT8 |EnsemblBacteria|BAD02123|NULL |\n",
+ "+------------------+---------------+--------+-----------+\n",
+ "only showing top 10 rows\n",
+ "+------------------+-------+--------------+-----------+\n",
+ "|entity_id |db |xref |description|\n",
+ "+------------------+-------+--------------+-----------+\n",
+ "|uniprot:A0A068QWV2|PANTHER|PTHR31956:SF1 |NULL |\n",
+ "|uniprot:A0A068QWV2|PANTHER|PTHR31956 |NULL |\n",
+ "|uniprot:A0A1I0A2X9|PANTHER|PTHR40089:SF1 |NULL |\n",
+ "|uniprot:A0A1I0A2X9|PANTHER|PTHR40089 |NULL |\n",
+ "|uniprot:A0A1J3HKS4|PANTHER|PTHR31356:SF8 |NULL |\n",
+ "|uniprot:A0A1J3HKS4|PANTHER|PTHR31356 |NULL |\n",
+ "|uniprot:A0A3P3WYY6|PANTHER|PTHR10381 |NULL |\n",
+ "|uniprot:A0A3P3WYY6|PANTHER|PTHR10381:SF15|NULL |\n",
+ "|uniprot:A0A6I1B2L6|PANTHER|PTHR42812 |NULL |\n",
+ "|uniprot:A0A6I1B2L6|PANTHER|PTHR42812:SF12|NULL |\n",
+ "+------------------+-------+--------------+-----------+\n",
+ "only showing top 10 rows\n",
+ "+------------------+---------+-----------+-----------+\n",
+ "|entity_id |db |xref |description|\n",
+ "+------------------+---------+-----------+-----------+\n",
+ "|uniprot:A0A068QWV2|Proteomes|UP000032721|NULL |\n",
+ "|uniprot:A0A068QWV2|Proteomes|UP000324170|NULL |\n",
+ "|uniprot:A0A0H3J6T1|Proteomes|UP000028042|NULL |\n",
+ "|uniprot:A0A0H3J6T1|Proteomes|UP000030905|NULL |\n",
+ "|uniprot:A0A1I0A2X9|Proteomes|UP000198612|NULL |\n",
+ "|uniprot:A0A1I0A2X9|Proteomes|UP000199519|NULL |\n",
+ "|uniprot:A0A1I7SRR9|Proteomes|UP000095284|NULL |\n",
+ "|uniprot:A0A1I7SRR9|Proteomes|UP000582659|NULL |\n",
+ "|uniprot:A0A1I7SRR9|Proteomes|UP000659654|NULL |\n",
+ "|uniprot:A0A3E4JR41|Proteomes|UP000260640|NULL |\n",
+ "+------------------+---------+-----------+-----------+\n",
+ "only showing top 10 rows\n",
+ "+--------------+---------------+------+-----------+\n",
+ "|entity_id |db |xref |description|\n",
+ "+--------------+---------------+------+-----------+\n",
+ "|uniprot:B2RYC9|PhosphoSitePlus|B2RYC9|NULL |\n",
+ "|uniprot:Q5U2U4|PhosphoSitePlus|Q5U2U4|NULL |\n",
+ "|uniprot:Q80XX9|PhosphoSitePlus|Q80XX9|NULL |\n",
+ "|uniprot:B1WBV4|PhosphoSitePlus|B1WBV4|NULL |\n",
+ "|uniprot:Q80SX3|PhosphoSitePlus|Q80SX3|NULL |\n",
+ "|uniprot:Q66H87|PhosphoSitePlus|Q66H87|NULL |\n",
+ "|uniprot:B0BNM6|PhosphoSitePlus|B0BNM6|NULL |\n",
+ "|uniprot:Q5U2V2|PhosphoSitePlus|Q5U2V2|NULL |\n",
+ "|uniprot:B2RYC6|PhosphoSitePlus|B2RYC6|NULL |\n",
+ "|uniprot:D4A5N6|PhosphoSitePlus|D4A5N6|NULL |\n",
+ "+--------------+---------------+------+-----------+\n",
+ "only showing top 10 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "def show_sample(db_value: str, n: int = 10):\n",
+ "    \"\"\"Print up to `n` rows of the global `df` whose `db` matches `db_value`, ignoring case.\"\"\"\n",
+ "    wanted = db_value.lower()\n",
+ "    matching = df.filter(lower(col(\"db\")) == wanted)\n",
+ "    preview = matching.select(\"entity_id\", \"db\", \"xref\", \"description\")\n",
+ "    preview.show(n, truncate=False)\n",
+ "\n",
+ "\n",
+ "show_sample(\"ensemblbacteria\")\n",
+ "show_sample(\"panther\")\n",
+ "show_sample(\"proteomes\")\n",
+ "show_sample(\"phosphositeplus\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "debe021d-dde6-4496-92d8-09e4fc7f1aa4",
+ "metadata": {},
+ "source": [
+ "- EnsemblBacteria → ensembl: map\n",
+ "- PANTHER → panther.family: map\n",
+ "- Proteomes → uniprot.proteome: map\n",
+ "- PhosphoSitePlus → `xref` equals the UniProt accession, so there is no independent identifier namespace: annotation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd81f3fc-68c3-4d28-97a6-aa12115f482a",
+ "metadata": {},
+ "source": [
+ "In the Bioregistry, some databases are not represented by a single flat prefix, but by a family of subtype-specific namespaces. \n",
+ "For example:\n",
+ "\n",
+ "- `panther.family`\n",
+ "- `panther.pathway`\n",
+ "- `panther.node`\n",
+ "\n",
+ "However, in UniProt data, the `db` field may simply contain `PANTHER`,\n",
+ "without specifying which subtype is intended.\n",
+ "\n",
+ "To align with Bioregistry’s canonical model, we map such ambiguous database names to a chosen default or most commonly used subtype (e.g., `panther.family`). \n",
+ "This process is referred to as **\"collapsing subtype namespaces\"**, meaning we collapse a generalized database label into a specific canonical subtype namespace for governance consistency.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "Not all `db` values in UniProt represent true identifier namespaces.\n",
+ "\n",
+ "Some entries function primarily as **annotation sources** rather than independent external identifier systems. In these cases:\n",
+ "\n",
+ "- The `xref` value often equals the UniProt accession itself.\n",
+ "- No independent external identifier is introduced.\n",
+ "- The database acts as a metadata or annotation provider.\n",
+ "\n",
+ "Examples include:\n",
+ "- ExpressionAtlas\n",
+ "- FunCoup\n",
+ "- PhosphoSitePlus\n",
+ "- GlyGen\n",
+ "\n",
+ "Because these entries do not introduce external identifiers, they should not be treated as canonical identifier namespaces requiring prefix normalization. \n",
+ "Instead, they are classified as `annotation` in the governance model.\n",
+ "\n",
+ "This distinction prevents misclassifying annotation metadata as unresolved namespace gaps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "89abb5ff-a8da-4cbd-adfd-aa05bfe28d55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## get unique prefixes\n",
+ "\n",
+ "distinct_prefixes = df.select(\"db\").distinct()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "d6fea29e-a2ad-4c17-9550-590c3d445438",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## compute normalization locally\n",
+ "\n",
+ "prefix_list = [row.db for row in distinct_prefixes.collect()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "3404b8ba-58df-4d16-97bc-7e59f7f8dbe9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rows = []\n",
+ "\n",
+ "for db_value in prefix_list:\n",
+ " result = normalize_prefix(db_value, registry_set)\n",
+ "\n",
+ " rows.append((db_value, result[\"normalized\"], result[\"category\"], result[\"is_registry_gap\"]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "76c95f31-06ef-492d-9d61-218cb60214cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataframe = spark.createDataFrame(rows, [\"db\", \"db_normalized\", \"prefix_category\", \"is_registry_gap\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "b86dcef1-856f-4bcc-b032-b1aa63a03cf5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql.functions import broadcast"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "9b4d9bbe-2ccb-4820-b48d-0e8b87b0c11a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_transformed = df.join(broadcast(dataframe), on=\"db\", how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "64c8067a-b398-414d-abcd-eb5f9398072a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove annotation-only sources.\n",
+ "# A plain `!=` evaluates to NULL for rows whose prefix_category is NULL (possible after the\n",
+ "# left join when `db` finds no match, e.g. a NULL `db`), and filter() drops such rows\n",
+ "# silently. The null-safe comparison removes only the `annotation` category.\n",
+ "df_transformed = df_transformed.filter(~col(\"prefix_category\").eqNullSafe(\"annotation\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "7cd22e3f-b45e-4ee9-b630-2637cf650621",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------------+----------+\n",
+ "|prefix_category| count|\n",
+ "+---------------+----------+\n",
+ "| map| 500297808|\n",
+ "| exact|3240702519|\n",
+ "| registry_gap| 551499509|\n",
+ "| synonym| 45285911|\n",
+ "+---------------+----------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_transformed.groupBy(\"prefix_category\").count().show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "5113dc5c-9dfa-4fe3-b97a-d4a2bf3dd160",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------+-------------------+\n",
+ "|db |db_normalized |\n",
+ "+-------------------+-------------------+\n",
+ "|NIAGADS |niagads |\n",
+ "|OpenTargets |opentargets |\n",
+ "|FunFam |funfam |\n",
+ "|Gene3D |gene3d |\n",
+ "|DNASU |dnasu |\n",
+ "|ProMEX |promex |\n",
+ "|ESTHER |esther |\n",
+ "|ClinPGx |clinpgx |\n",
+ "|CarbonylDB |carbonyldb |\n",
+ "|PHI-base |phi-base |\n",
+ "|AGR |agr |\n",
+ "|EnsemblProtists |ensemblprotists |\n",
+ "|Antibodypedia |antibodypedia |\n",
+ "|PATRIC |patric |\n",
+ "|SignaLink |signalink |\n",
+ "|CARD |card |\n",
+ "|euHCVdb |euhcvdb |\n",
+ "|EnsemblFungi |ensemblfungi |\n",
+ "|Bgee |bgee |\n",
+ "|ChiTaRS |chitars |\n",
+ "|DisGeNET |disgenet |\n",
+ "|BioGRID-ORCS |biogrid-orcs |\n",
+ "|WBParaSite |wbparasite |\n",
+ "|GeneCards |genecards |\n",
+ "|SABIO-RK |sabio-rk |\n",
+ "|NCBIfam |ncbifam |\n",
+ "|SFLD |sfld |\n",
+ "|VEuPathDB |veupathdb |\n",
+ "|AlphaFoldDB |alphafolddb |\n",
+ "|BioMuta |biomuta |\n",
+ "|CD-CODE |cd-code |\n",
+ "|EvolutionaryTrace |evolutionarytrace |\n",
+ "|TAIR |tair |\n",
+ "|PlantReactome |plantreactome |\n",
+ "|Leproma |leproma |\n",
+ "|PCDDB |pcddb |\n",
+ "|PseudoCAP |pseudocap |\n",
+ "|MalaCards |malacards |\n",
+ "|BMRB |bmrb |\n",
+ "|MoonProt |moonprot |\n",
+ "|JaponicusDB |japonicusdb |\n",
+ "|jPOST |jpost |\n",
+ "|LegioList |legiolist |\n",
+ "|CollecTF |collectf |\n",
+ "|UniLectin |unilectin |\n",
+ "|STRENDA-DB |strenda-db |\n",
+ "|REPRODUCTION-2DPAGE|reproduction-2dpage|\n",
+ "|RNAct |rnact |\n",
+ "|GlyConnect |glyconnect |\n",
+ "|SwissLipids |swisslipids |\n",
+ "+-------------------+-------------------+\n",
+ "only showing top 50 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Registry gap prefixes\n",
+ "\n",
+ "df_transformed.filter(col(\"is_registry_gap\") == True).select(\"db\", \"db_normalized\").distinct().show(50, truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "1f98810a-1177-47f8-875b-38d11b6fd0c7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------------+--------------------+\n",
+ "|db |db_normalized |\n",
+ "+---------------+--------------------+\n",
+ "|PaxDb |paxdb.protein |\n",
+ "|EnsemblBacteria|ensembl |\n",
+ "|EnsemblPlants |ensembl |\n",
+ "|PIR |pirsf |\n",
+ "|OMA |oma.protein |\n",
+ "|EnsemblMetazoa |ensembl |\n",
+ "|MEROPS |merops.entry |\n",
+ "|PRO |pr |\n",
+ "|Proteomes |uniprot.proteome |\n",
+ "|ProteomicsDB |proteomicsdb.protein|\n",
+ "|PANTHER |panther.family |\n",
+ "|PeptideAtlas |peptideatlas.peptide|\n",
+ "+---------------+--------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Mapped Prefixes\n",
+ "\n",
+ "df_transformed.filter(col(\"prefix_category\") == \"map\").select(\"db\", \"db_normalized\").distinct().show(50, truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "0531cc4d-f091-4252-89d8-99d6414ad7d2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-----------+------------------+--------------------+--------------------+------------------+--------------+--------------------+----------------+---------------+---------------+\n",
+ "| db| entity_id| xref| description| _dlt_load_id| _dlt_id| relationship| db_normalized|prefix_category|is_registry_gap|\n",
+ "+-----------+------------------+--------------------+--------------------+------------------+--------------+--------------------+----------------+---------------+---------------+\n",
+ "| PRINTS|uniprot:A0A068QWH9| PR00368| NULL|1770728436.7741342|drstc13RmvdHag| NULL| prints| exact| false|\n",
+ "| PRINTS|uniprot:A0A068QWH9| PR00411| NULL|1770728436.7741342|MPVeMCDjxAJ89Q| NULL| prints| exact| false|\n",
+ "| SUPFAM|uniprot:A0A068QWH9| SSF51905| NULL|1770728436.7741342|VREQxAb6fbK+BQ| NULL| supfam| exact| false|\n",
+ "| SUPFAM|uniprot:A0A068QWH9| SSF55424| NULL|1770728436.7741342|ekRrV/FUJ73c2Q| NULL| supfam| exact| false|\n",
+ "| PROSITE|uniprot:A0A068QWH9| PS00076| NULL|1770728436.7741342|kuBN643V/sWyng| NULL| prosite| exact| false|\n",
+ "| NCBITaxon|uniprot:A0A068QWH9| 351671|UniProt taxon des...|1770728436.7741342|j9SFXXE0eB6ZvA|RO:0002162: in taxon| ncbitaxon| exact| false|\n",
+ "| UniProt|uniprot:A0A068QWV2| A0A068QWV2| UniProt accession|1770728436.7741342|b7ZowyA/KoIYQQ| NULL| uniprot| exact| false|\n",
+ "| EC|uniprot:A0A068QWV2| 3.1.4.3| NULL|1770728436.7741342|3wChU8GHa16jeA| NULL| ec| exact| false|\n",
+ "| genbank|uniprot:A0A068QWV2| FO704550|EMBL/GenBank Geno...|1770728436.7741342|zjDC0fJ2KO9n0A| NULL| genbank| exact| false|\n",
+ "| genbank|uniprot:A0A068QWV2| CDG19458.1|EMBL/GenBank prot...|1770728436.7741342|OMeDsMUhyKdOMA| NULL| genbank| exact| false|\n",
+ "| genbank|uniprot:A0A068QWV2| VNHN01000033|EMBL/GenBank Geno...|1770728436.7741342|M+JfLmKYi0qx+w| NULL| genbank| exact| false|\n",
+ "| genbank|uniprot:A0A068QWV2| TYP04735.1|EMBL/GenBank prot...|1770728436.7741342|tc7JJLaODr5shg| NULL| genbank| exact| false|\n",
+ "| refseq|uniprot:A0A068QWV2| WP_045973118.1|RefSeq protein se...|1770728436.7741342|Er6xoFDcSkd6IA| NULL| refseq| exact| false|\n",
+ "| refseq|uniprot:A0A068QWV2|NZ_CAWMED010000001.1|RefSeq nucleotide...|1770728436.7741342|NGjNrXxjewR4oQ| NULL| refseq| exact| false|\n",
+ "|AlphaFoldDB|uniprot:A0A068QWV2| A0A068QWV2| NULL|1770728436.7741342|Vc53XDEXlJNNvQ| NULL| alphafolddb| registry_gap| true|\n",
+ "| STRING|uniprot:A0A068QWV2| 351671.XDD1_3773| NULL|1770728436.7741342|dfoCsiPNh1BYGQ| NULL| string| exact| false|\n",
+ "| KEGG|uniprot:A0A068QWV2| xdo:XDD1_3773| NULL|1770728436.7741342|J1bvJ9Mtyd8N7Q| NULL| kegg| exact| false|\n",
+ "| HOGENOM|uniprot:A0A068QWV2| CLU_008770_1_0_6| NULL|1770728436.7741342|B2REs2T67kzU+g| NULL| hogenom| exact| false|\n",
+ "| OrthoDB|uniprot:A0A068QWV2| 9770871at2| NULL|1770728436.7741342|AApvMZ8ZATinIQ| NULL| orthodb| exact| false|\n",
+ "| Proteomes|uniprot:A0A068QWV2| UP000032721| NULL|1770728436.7741342|ji7YXcL+k6ld2A| NULL|uniprot.proteome| map| false|\n",
+ "+-----------+------------------+--------------------+--------------------+------------------+--------------+--------------------+----------------+---------------+---------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_transformed.limit(20).show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f57f7b4-8540-455b-b7db-dac3bb23ad6a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Transformation complete.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Write into a dedicated sub-directory: mode(\"overwrite\") replaces whatever already exists at\n",
+ "# the target path, and the parent data directory is also used for the prefix list files.\n",
+ "# NOTE(review): where this relative path resolves depends on the Spark filesystem config - confirm.\n",
+ "OUTPUT_PATH = \"uniprot_prefix_investigation/data/xrefs_normalized\"\n",
+ "\n",
+ "df_transformed.write.mode(\"overwrite\").parquet(OUTPUT_PATH)\n",
+ "print(\"Transformation complete.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "d12a4fdf-d065-4df2-990d-8a3d6b0bd146",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------------+----------+\n",
+ "|prefix_category| count|\n",
+ "+---------------+----------+\n",
+ "| map| 500297808|\n",
+ "| exact|3240702519|\n",
+ "| registry_gap| 551499509|\n",
+ "| synonym| 45285911|\n",
+ "+---------------+----------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_transformed.groupBy(\"prefix_category\").count().show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "8a519d75-9a37-4219-952a-42c854ed59ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------+-------------------+\n",
+ "|db |db_normalized |\n",
+ "+-------------------+-------------------+\n",
+ "|NIAGADS |niagads |\n",
+ "|OpenTargets |opentargets |\n",
+ "|FunFam |funfam |\n",
+ "|Gene3D |gene3d |\n",
+ "|DNASU |dnasu |\n",
+ "|ProMEX |promex |\n",
+ "|ESTHER |esther |\n",
+ "|ClinPGx |clinpgx |\n",
+ "|PHI-base |phi-base |\n",
+ "|AGR |agr |\n",
+ "|EnsemblProtists |ensemblprotists |\n",
+ "|Antibodypedia |antibodypedia |\n",
+ "|PATRIC |patric |\n",
+ "|SignaLink |signalink |\n",
+ "|CARD |card |\n",
+ "|EnsemblFungi |ensemblfungi |\n",
+ "|TAIR |tair |\n",
+ "|Bgee |bgee |\n",
+ "|ChiTaRS |chitars |\n",
+ "|DisGeNET |disgenet |\n",
+ "|BioGRID-ORCS |biogrid-orcs |\n",
+ "|WBParaSite |wbparasite |\n",
+ "|GeneCards |genecards |\n",
+ "|NCBIfam |ncbifam |\n",
+ "|SFLD |sfld |\n",
+ "|VEuPathDB |veupathdb |\n",
+ "|AlphaFoldDB |alphafolddb |\n",
+ "|BioMuta |biomuta |\n",
+ "|CD-CODE |cd-code |\n",
+ "|EvolutionaryTrace |evolutionarytrace |\n",
+ "|BMRB |bmrb |\n",
+ "|MoonProt |moonprot |\n",
+ "|euHCVdb |euhcvdb |\n",
+ "|SABIO-RK |sabio-rk |\n",
+ "|STRENDA-DB |strenda-db |\n",
+ "|CollecTF |collectf |\n",
+ "|UniLectin |unilectin |\n",
+ "|LegioList |legiolist |\n",
+ "|jPOST |jpost |\n",
+ "|RNAct |rnact |\n",
+ "|GlyConnect |glyconnect |\n",
+ "|PlantReactome |plantreactome |\n",
+ "|CarbonylDB |carbonyldb |\n",
+ "|PCDDB |pcddb |\n",
+ "|REPRODUCTION-2DPAGE|reproduction-2dpage|\n",
+ "|Leproma |leproma |\n",
+ "|PseudoCAP |pseudocap |\n",
+ "|JaponicusDB |japonicusdb |\n",
+ "|TubercuList |tuberculist |\n",
+ "|PAN-GO |pan-go |\n",
+ "+-------------------+-------------------+\n",
+ "only showing top 50 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_transformed.filter(col(\"is_registry_gap\") == True).select(\"db\", \"db_normalized\").distinct().show(50, truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "811b2725-f60c-4962-aa76-1299fb4b97e4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-----------+------------------+----------------------+-----------+------------------+--------------+------------+-------------+---------------+---------------+\n",
+ "|db |entity_id |xref |description|_dlt_load_id |_dlt_id |relationship|db_normalized|prefix_category|is_registry_gap|\n",
+ "+-----------+------------------+----------------------+-----------+------------------+--------------+------------+-------------+---------------+---------------+\n",
+ "|AlphaFoldDB|uniprot:A0A068QWV2|A0A068QWV2 |NULL |1770728436.7741342|Vc53XDEXlJNNvQ|NULL |alphafolddb |registry_gap |true |\n",
+ "|Gene3D |uniprot:A0A068QWV2|3.40.720.10 |NULL |1770728436.7741342|VK3C8++f3UXukw|NULL |gene3d |registry_gap |true |\n",
+ "|NCBIfam |uniprot:A0A068QWV2|TIGR03396 |NULL |1770728436.7741342|fgF+NG8pQm3Kmw|NULL |ncbifam |registry_gap |true |\n",
+ "|AlphaFoldDB|uniprot:A0A0H3J6T1|A0A0H3J6T1 |NULL |1770728436.7741342|DHNHWLuCfvBaZg|NULL |alphafolddb |registry_gap |true |\n",
+ "|PATRIC |uniprot:A0A0H3J6T1|fig|1262449.7.peg.3138|NULL |1770728436.7741342|uyyASwoMoKBqZw|NULL |patric |registry_gap |true |\n",
+ "|AlphaFoldDB|uniprot:A0A1I0A2X9|A0A1I0A2X9 |NULL |1770728436.7741342|mPTa3bx78Q+tDA|NULL |alphafolddb |registry_gap |true |\n",
+ "|NCBIfam |uniprot:A0A1I0A2X9|NF011666 |NULL |1770728436.7741342|j5/3+i0E0w/UXA|NULL |ncbifam |registry_gap |true |\n",
+ "|NCBIfam |uniprot:A0A1I0A2X9|NF011667 |NULL |1770728436.7741342|NBILb/GZT7oIVw|NULL |ncbifam |registry_gap |true |\n",
+ "|AlphaFoldDB|uniprot:A0A1I7SRR9|A0A1I7SRR9 |NULL |1770728436.7741342|nAKV3EP5pyG6+w|NULL |alphafolddb |registry_gap |true |\n",
+ "|WBParaSite |uniprot:A0A1I7SRR9|BXY_1573600.1 |NULL |1770728436.7741342|RLX8mIM/aOY9iw|NULL |wbparasite |registry_gap |true |\n",
+ "+-----------+------------------+----------------------+-----------+------------------+--------------+------------+-------------+---------------+---------------+\n",
+ "only showing top 10 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_transformed.filter(col(\"is_registry_gap\") == True).show(10, truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1947031b-1cd3-4b74-b980-279ad877379e",
+ "metadata": {},
+ "source": [
+ "## Overall Classification Summary\n",
+ "\n",
+ "After applying prefix normalization to the UniProt identifier parquet dataset and removing annotation-only sources, the rows were categorized as follows:\n",
+ "\n",
+ "| Category | Row count |\n",
+ "|----------------|--------|\n",
+ "| exact | 3,240,702,519 |\n",
+ "| map | 500,297,808 |\n",
+ "| synonym | 45,285,911 |\n",
+ "| registry_gap | 551,499,509 |\n",
+ "\n",
+ "### Key Observations\n",
+ "\n",
+ "- The majority of rows carry prefixes that align with canonical BioRegistry namespaces (`exact`, `map` or `synonym`).\n",
+ "- **551,499,509 rows** (roughly 13% of the retained rows) fall into the `registry_gap` category.\n",
+ "- No prefix is left unclassified, but this holds by construction: any prefix that matches no rule falls through to `registry_gap`, so that category mixes the curated `REGISTRY_GAP` list with prefixes that have not been reviewed yet.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "The prefix normalization is now:\n",
+ "\n",
+ "- Deterministic\n",
+ "- Fully classified\n",
+ "- Reproducible\n",
+ "- Compatible with Spark transformation\n",
+ "- Transparent about registry gaps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8126a2e-b8ac-41c9-b436-153855e523e2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv312",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/uniprot_prefix_investigation/uniprot_prefixes.ipynb b/notebooks/uniprot_prefix_investigation/uniprot_prefixes.ipynb
new file mode 100644
index 0000000..77244ca
--- /dev/null
+++ b/notebooks/uniprot_prefix_investigation/uniprot_prefixes.ipynb
@@ -0,0 +1,3792 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "40bd11b6-ca4f-419a-8a64-6fbabf73bf0d",
+ "metadata": {},
+ "source": [
+ "# Load UniProt official registry"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "cd691fac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import defaultdict\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import bioregistry as br\n",
+ "import requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "c70b72bc",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'name': 'ABCD curated depository of sequenced antibodies',\n",
+ " 'id': 'DB-0236',\n",
+ " 'abbrev': 'ABCD',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://web.expasy.org/abcd'],\n",
+ " 'dbUrl': 'https://web.expasy.org/cgi-bin/abcd/search_abcd.pl?input=%u',\n",
+ " 'category': 'Protocols and materials databases',\n",
+ " 'statistics': {'reviewedProteinCount': 3196, 'unreviewedProteinCount': 619}},\n",
+ " {'name': 'The Alliance of Genome Resources',\n",
+ " 'id': 'DB-0266',\n",
+ " 'abbrev': 'AGR',\n",
+ " 'pubMedId': '31552413',\n",
+ " 'doiId': '10.1093/nar/gkz813',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.alliancegenome.org/'],\n",
+ " 'dbUrl': 'https://www.alliancegenome.org/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 68617,\n",
+ " 'unreviewedProteinCount': 246243}},\n",
+ " {'name': 'Agora',\n",
+ " 'id': 'DB-0283',\n",
+ " 'abbrev': 'Agora',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://agora.adknowledgeportal.org'],\n",
+ " 'dbUrl': 'https://agora.adknowledgeportal.org/genes/%s/',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18413, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Allergome; a platform for allergen knowledge',\n",
+ " 'id': 'DB-0160',\n",
+ " 'abbrev': 'Allergome',\n",
+ " 'pubMedId': '19671381',\n",
+ " 'doiId': '10.1007/s11882-009-0055-9',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.allergome.org/'],\n",
+ " 'dbUrl': 'https://www.allergome.org/script/dettaglio.php?id_molecule=%s',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1316,\n",
+ " 'unreviewedProteinCount': 3081}},\n",
+ " {'name': 'AlphaFold Protein Structure Database',\n",
+ " 'id': 'DB-0262',\n",
+ " 'abbrev': 'AlphaFoldDB',\n",
+ " 'pubMedId': '37933859',\n",
+ " 'doiId': '10.1093/nar/gkad1011',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://alphafold.ebi.ac.uk/',\n",
+ " 'https://deepmind.com/research/open-source/alphafold-protein-structure-database'],\n",
+ " 'dbUrl': 'https://alphafold.ebi.ac.uk/search/text/%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 549408,\n",
+ " 'unreviewedProteinCount': 158049481}},\n",
+ " {'name': 'AntiFam resource to identify spurious protein predictions',\n",
+ " 'id': 'DB-0275',\n",
+ " 'abbrev': 'AntiFam',\n",
+ " 'pubMedId': '22434837',\n",
+ " 'doiId': '10.1093/database/bas003',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/interpro/entry/antifam'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/antifam/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 22, 'unreviewedProteinCount': 26090}},\n",
+ " {'name': 'Antibodypedia a portal for validated antibodies',\n",
+ " 'id': 'DB-0249',\n",
+ " 'abbrev': 'Antibodypedia',\n",
+ " 'pubMedId': '18667413',\n",
+ " 'doiId': '10.1074/mcp.M800264-MCP200',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.antibodypedia.com/'],\n",
+ " 'dbUrl': 'https://www.antibodypedia.com/gene/%s',\n",
+ " 'category': 'Protocols and materials databases',\n",
+ " 'statistics': {'reviewedProteinCount': 32245,\n",
+ " 'unreviewedProteinCount': 71628}},\n",
+ " {'name': 'ArachnoServer',\n",
+ " 'id': 'DB-0145',\n",
+ " 'abbrev': 'ArachnoServer',\n",
+ " 'pubMedId': '29069336',\n",
+ " 'doiId': '10.1093/bioinformatics/btx661',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.arachnoserver.org'],\n",
+ " 'dbUrl': 'http://www.arachnoserver.org',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1138, 'unreviewedProteinCount': 195}},\n",
+ " {'name': 'Arabidopsis Information Portal',\n",
+ " 'id': 'DB-0221',\n",
+ " 'abbrev': 'Araport',\n",
+ " 'pubMedId': '25414324',\n",
+ " 'doiId': '10.1093/nar/gku1200',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://bar.utoronto.ca/thalemine/begin.do',\n",
+ " 'https://www.araport.org/'],\n",
+ " 'dbUrl': 'https://bar.utoronto.ca/thalemine/portal.do?externalids=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 16342,\n",
+ " 'unreviewedProteinCount': 23420}},\n",
+ " {'name': 'Biological Magnetic Resonance Data Bank',\n",
+ " 'id': 'DB-0256',\n",
+ " 'abbrev': 'BMRB',\n",
+ " 'pubMedId': '36478084',\n",
+ " 'doiId': '10.1093/nar/gkac1050',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://bmrb.io/'],\n",
+ " 'dbUrl': 'https://bmrb.io/data_library/summary/protein.php?uniprot=%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 6914, 'unreviewedProteinCount': 279}},\n",
+ " {'name': 'BRENDA Comprehensive Enzyme Information System',\n",
+ " 'id': 'DB-0131',\n",
+ " 'abbrev': 'BRENDA',\n",
+ " 'pubMedId': '33211880',\n",
+ " 'doiId': '10.1093/nar/gkaa1025',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.brenda-enzymes.org'],\n",
+ " 'dbUrl': 'https://www.brenda-enzymes.org/enzyme.php?ecno=%s&UniProtAcc=%u&OrganismID=%d',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18683,\n",
+ " 'unreviewedProteinCount': 16399}},\n",
+ " {'name': 'Bgee dataBase for Gene Expression Evolution',\n",
+ " 'id': 'DB-0133',\n",
+ " 'abbrev': 'Bgee',\n",
+ " 'pubMedId': '33037820',\n",
+ " 'doiId': '10.1093/nar/gkaa793',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.bgee.org'],\n",
+ " 'dbUrl': 'https://www.bgee.org/gene/%s',\n",
+ " 'category': 'Gene expression databases',\n",
+ " 'statistics': {'reviewedProteinCount': 61984,\n",
+ " 'unreviewedProteinCount': 1245174}},\n",
+ " {'name': 'BindingDB database of measured binding affinities',\n",
+ " 'id': 'DB-0127',\n",
+ " 'abbrev': 'BindingDB',\n",
+ " 'pubMedId': '39574417',\n",
+ " 'doiId': '10.1093/nar/gkae1075',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.bindingdb.org/'],\n",
+ " 'dbUrl': 'https://www.bindingdb.org/rwd/uniprot/%u',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 6929, 'unreviewedProteinCount': 919}},\n",
+ " {'name': 'BioCyc Collection of Pathway/Genome Databases',\n",
+ " 'id': 'DB-0005',\n",
+ " 'abbrev': 'BioCyc',\n",
+ " 'pubMedId': '29447345',\n",
+ " 'doiId': '10.1093/bib/bbx085',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.biocyc.org/'],\n",
+ " 'dbUrl': 'https://biocyc.org/getid?id=%s',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 44210,\n",
+ " 'unreviewedProteinCount': 268710}},\n",
+ " {'name': 'The Biological General Repository for Interaction Datasets (BioGRID)',\n",
+ " 'id': 'DB-0184',\n",
+ " 'abbrev': 'BioGRID',\n",
+ " 'pubMedId': '33070389',\n",
+ " 'doiId': '10.1002/pro.3978',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://thebiogrid.org/'],\n",
+ " 'dbUrl': 'https://thebiogrid.org/%s',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 60445, 'unreviewedProteinCount': 1}},\n",
+ " {'name': 'BioGRID ORCS database of CRISPR phenotype screens',\n",
+ " 'id': 'DB-0252',\n",
+ " 'abbrev': 'BioGRID-ORCS',\n",
+ " 'pubMedId': '30476227',\n",
+ " 'doiId': '10.1093/nar/gky1079',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://orcs.thebiogrid.org'],\n",
+ " 'dbUrl': 'https://orcs.thebiogrid.org/Gene/%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 44557,\n",
+ " 'unreviewedProteinCount': 56284}},\n",
+ " {'name': 'BioMuta curated single-nucleotide variation and disease association database',\n",
+ " 'id': 'DB-0192',\n",
+ " 'abbrev': 'BioMuta',\n",
+ " 'pubMedId': '30053270',\n",
+ " 'doiId': '10.1093/nar/gkx907',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://hivelab.biochemistry.gwu.edu/biomuta'],\n",
+ " 'dbUrl': 'https://hive.biochemistry.gwu.edu/tools/biomuta/biomuta.php?gene=%s',\n",
+ " 'category': 'Genetic variation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 20259,\n",
+ " 'unreviewedProteinCount': 889}},\n",
+ " {'name': 'The Comprehensive Antibiotic Resistance Database',\n",
+ " 'id': 'DB-0282',\n",
+ " 'abbrev': 'CARD',\n",
+ " 'pubMedId': '36263822',\n",
+ " 'doiId': '10.1093/nar/gkac920',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://card.mcmaster.ca'],\n",
+ " 'dbUrl': 'https://card.mcmaster.ca/%s',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 319, 'unreviewedProteinCount': 3851}},\n",
+ " {'name': 'Carbohydrate-Active enZymes',\n",
+ " 'id': 'DB-0136',\n",
+ " 'abbrev': 'CAZy',\n",
+ " 'pubMedId': '34850161',\n",
+ " 'doiId': '10.1093/nar/gkab1045',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.cazy.org/'],\n",
+ " 'dbUrl': 'https://www.cazy.org/%s.html',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 8755,\n",
+ " 'unreviewedProteinCount': 108751}},\n",
+ " {'name': 'The Consensus CDS (CCDS) project',\n",
+ " 'id': 'DB-0187',\n",
+ " 'abbrev': 'CCDS',\n",
+ " 'pubMedId': '29126148',\n",
+ " 'doiId': '10.1093/nar/gkx1031',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=%s',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 34842, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'CrowDsourcing COndensate Database and Encyclopedia',\n",
+ " 'id': 'DB-0277',\n",
+ " 'abbrev': 'CD-CODE',\n",
+ " 'pubMedId': '37024650',\n",
+ " 'doiId': '10.1038/s41592-023-01831-0',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://cd-code.org/'],\n",
+ " 'dbUrl': 'https://cd-code.org/condensate/%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 8219,\n",
+ " 'unreviewedProteinCount': 1453}},\n",
+ " {'name': 'Conserved Domains Database',\n",
+ " 'id': 'DB-0214',\n",
+ " 'abbrev': 'CDD',\n",
+ " 'pubMedId': '36477806',\n",
+ " 'doiId': '10.1093/nar/gkac1096',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/cdd'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 309774,\n",
+ " 'unreviewedProteinCount': 67075747}},\n",
+ " {'name': 'Candida Genome Database',\n",
+ " 'id': 'DB-0126',\n",
+ " 'abbrev': 'CGD',\n",
+ " 'pubMedId': '27738138',\n",
+ " 'doiId': '10.1093/nar/gkw924',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.candidagenome.org/'],\n",
+ " 'dbUrl': 'http://www.candidagenome.org/cgi-bin/locus.pl?dbid=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2143,\n",
+ " 'unreviewedProteinCount': 20526}},\n",
+ " {'name': 'CIViC Clinical Interpretation of Variants in Cancer',\n",
+ " 'id': 'DB-0284',\n",
+ " 'abbrev': 'CIViC',\n",
+ " 'pubMedId': '28138153',\n",
+ " 'doiId': '110.1038/ng.3774',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://civicdb.org'],\n",
+ " 'dbUrl': 'https://civicdb.org/links/entrez_id/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 568, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'CORUM comprehensive resource of mammalian protein complexes',\n",
+ " 'id': 'DB-0224',\n",
+ " 'abbrev': 'CORUM',\n",
+ " 'pubMedId': '36382402',\n",
+ " 'doiId': '10.1093/nar/gkac1015',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://mips.helmholtz-muenchen.de/corum/'],\n",
+ " 'dbUrl': 'https://mips.helmholtz-muenchen.de/corum/#?uniprotID=%u',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 8090, 'unreviewedProteinCount': 349}},\n",
+ " {'name': 'The CPTAC Assay portal',\n",
+ " 'id': 'DB-0238',\n",
+ " 'abbrev': 'CPTAC',\n",
+ " 'pubMedId': '24972168',\n",
+ " 'doiId': '10.1038/nmeth.3002',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://assays.cancer.gov/'],\n",
+ " 'dbUrl': 'https://assays.cancer.gov/%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1929, 'unreviewedProteinCount': 13}},\n",
+ " {'name': 'The CPTC Antibody Portal',\n",
+ " 'id': 'DB-0259',\n",
+ " 'abbrev': 'CPTC',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://antibodies.cancer.gov'],\n",
+ " 'dbUrl': 'https://antibodies.cancer.gov/uniprot/%u',\n",
+ " 'category': 'Protocols and materials databases',\n",
+ " 'statistics': {'reviewedProteinCount': 410, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Comparative Toxicogenomics Database',\n",
+ " 'id': 'DB-0140',\n",
+ " 'abbrev': 'CTD',\n",
+ " 'pubMedId': '33068428',\n",
+ " 'doiId': '10.1093/nar/gkaa891',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://ctdbase.org/'],\n",
+ " 'dbUrl': 'https://ctdbase.org/detail.go?type=gene&db=GENE&acc=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 77047,\n",
+ " 'unreviewedProteinCount': 3968516}},\n",
+ " {'name': 'CarbonylDB database of protein carbonylation sites',\n",
+ " 'id': 'DB-0225',\n",
+ " 'abbrev': 'CarbonylDB',\n",
+ " 'pubMedId': '29509874',\n",
+ " 'doiId': '10.1093/bioinformatics/bty123',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://carbonyldb.missouri.edu/CarbonylDB/index.php'],\n",
+ " 'dbUrl': 'https://carbonyldb.missouri.edu/CarbonylDB/index.php/detail/protein/%u',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1159, 'unreviewedProteinCount': 305}},\n",
+ " {'name': 'ChEMBL database of bioactive drug-like small molecules',\n",
+ " 'id': 'DB-0174',\n",
+ " 'abbrev': 'ChEMBL',\n",
+ " 'pubMedId': '37933841',\n",
+ " 'doiId': '10.1093/nar/gkad1004',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/chembl'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/chembl/target_report_card/%s',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 9110,\n",
+ " 'unreviewedProteinCount': 1179}},\n",
+ " {'name': 'ChiTaRS',\n",
+ " 'id': 'DB-0176',\n",
+ " 'abbrev': 'ChiTaRS',\n",
+ " 'pubMedId': '39676654',\n",
+ " 'doiId': '10.1093/nar/gkae1126',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://biosrv.org/chmb/information'],\n",
+ " 'dbUrl': 'http://biosrv.org/chmb/search?GEN=%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 15251,\n",
+ " 'unreviewedProteinCount': 107098}},\n",
+ " {'name': 'The ClinGen clinical relevance of genes and variants resource',\n",
+ " 'id': 'DB-0263',\n",
+ " 'abbrev': 'ClinGen',\n",
+ " 'pubMedId': '26014595',\n",
+ " 'doiId': '10.1056/NEJMsr1406261',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://www.clinicalgenome.org/'],\n",
+ " 'dbUrl': 'https://search.clinicalgenome.org/kb/genes/%s',\n",
+ " 'category': 'Genetic variation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'The Clinical Pharmacogenomics Resource',\n",
+ " 'id': 'DB-0285',\n",
+ " 'abbrev': 'ClinPGx',\n",
+ " 'pubMedId': '34216021',\n",
+ " 'doiId': '10.1002/cpt.2350',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.clinpgx.org'],\n",
+ " 'dbUrl': 'https://www.clinpgx.org/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18009,\n",
+ " 'unreviewedProteinCount': 3064}},\n",
+ " {'name': 'CollecTF database of bacterial transcription factor binding sites',\n",
+ " 'id': 'DB-0198',\n",
+ " 'abbrev': 'CollecTF',\n",
+ " 'pubMedId': '24234444',\n",
+ " 'doiId': '10.1093/nar/gkt1087',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.collectf.org/'],\n",
+ " 'dbUrl': 'http://www.collectf.org/%s',\n",
+ " 'category': 'Gene expression databases',\n",
+ " 'statistics': {'reviewedProteinCount': 138, 'unreviewedProteinCount': 163}},\n",
+ " {'name': 'ComplexPortal',\n",
+ " 'id': 'DB-0228',\n",
+ " 'abbrev': 'ComplexPortal',\n",
+ " 'pubMedId': '30357405',\n",
+ " 'doiId': '10.1093/nar/gky1001',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/complexportal/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/complexportal/complex/%s',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 9733, 'unreviewedProteinCount': 716}},\n",
+ " {'name': 'ConoServer',\n",
+ " 'id': 'DB-0156',\n",
+ " 'abbrev': 'ConoServer',\n",
+ " 'pubMedId': '22058133',\n",
+ " 'doiId': '10.1093/nar/gkr886',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.conoserver.org/'],\n",
+ " 'dbUrl': 'https://www.conoserver.org/?page=card&table=protein&id=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 879, 'unreviewedProteinCount': 157}},\n",
+ " {'name': 'DNA Data Bank of Japan; a nucleotide sequence database',\n",
+ " 'id': 'DB-0014',\n",
+ " 'abbrev': 'DDBJ',\n",
+ " 'pubMedId': '36420889',\n",
+ " 'doiId': '10.1093/nar/gkac1083',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://www.ddbj.nig.ac.jp/index-e.html'],\n",
+ " 'dbUrl': 'https://getentry.ddbj.nig.ac.jp/getentry/na/%s?filetype=html',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'DEPOD human dephosphorylation database',\n",
+ " 'id': 'DB-0190',\n",
+ " 'abbrev': 'DEPOD',\n",
+ " 'pubMedId': '31836896',\n",
+ " 'doiId': 'doi',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://depod.bioss.uni-freiburg.de'],\n",
+ " 'dbUrl': 'https://depod.bioss.uni-freiburg.de/showp.php?name=%s',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 254, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Database of interacting proteins',\n",
+ " 'id': 'DB-0016',\n",
+ " 'abbrev': 'DIP',\n",
+ " 'pubMedId': '14681454',\n",
+ " 'doiId': '10.1093/nar/gkh086',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://dip.doe-mbi.ucla.edu/'],\n",
+ " 'dbUrl': 'https://dip.doe-mbi.ucla.edu/dip/Browse.cgi?ID=%s',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 17539,\n",
+ " 'unreviewedProteinCount': 2949}},\n",
+ " {'name': 'Domain mapping of disease mutations (DMDM)',\n",
+ " 'id': 'DB-0166',\n",
+ " 'abbrev': 'DMDM',\n",
+ " 'pubMedId': '20685956',\n",
+ " 'doiId': '10.1093/bioinformatics/btq447',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://bioinf.umbc.edu/dmdm/'],\n",
+ " 'dbUrl': 'https://bioinf.umbc.edu/dmdm/gene_prot_page.php?search_type=protein&id=%s',\n",
+ " 'category': 'Genetic variation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 16163, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'The DNASU plasmid repository',\n",
+ " 'id': 'DB-0167',\n",
+ " 'abbrev': 'DNASU',\n",
+ " 'pubMedId': '24225319',\n",
+ " 'doiId': '10.1093/nar/gkt1060',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://dnasu.org/DNASU/'],\n",
+ " 'dbUrl': 'https://dnasu.org/DNASU/AdvancedSearchOptions.do?geneName=%s',\n",
+ " 'category': 'Protocols and materials databases',\n",
+ " 'statistics': {'reviewedProteinCount': 48477,\n",
+ " 'unreviewedProteinCount': 87491}},\n",
+ " {'name': 'DisGeNET',\n",
+ " 'id': 'DB-0218',\n",
+ " 'abbrev': 'DisGeNET',\n",
+ " 'pubMedId': '31680165',\n",
+ " 'doiId': '10.1093/nar/gkz1021',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.disgenet.com/'],\n",
+ " 'dbUrl': 'https://www.disgenet.com/search?view=GENES&idents=%s&source=ALL&tab=GDA',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 17415,\n",
+ " 'unreviewedProteinCount': 11508}},\n",
+ " {'name': 'Database of protein disorder',\n",
+ " 'id': 'DB-0017',\n",
+ " 'abbrev': 'DisProt',\n",
+ " 'pubMedId': '34850135',\n",
+ " 'doiId': '10.1093/nar/gkab1082',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://disprot.org'],\n",
+ " 'dbUrl': 'https://disprot.org/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2800, 'unreviewedProteinCount': 370}},\n",
+ " {'name': 'Drug and drug target database',\n",
+ " 'id': 'DB-0019',\n",
+ " 'abbrev': 'DrugBank',\n",
+ " 'pubMedId': '37953279',\n",
+ " 'doiId': '10.1093/nar/gkad976',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://go.drugbank.com'],\n",
+ " 'dbUrl': 'https://go.drugbank.com/drugs/%s',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 4939, 'unreviewedProteinCount': 406}},\n",
+ " {'name': 'DrugCentral',\n",
+ " 'id': 'DB-0239',\n",
+ " 'abbrev': 'DrugCentral',\n",
+ " 'pubMedId': '36484092',\n",
+ " 'doiId': '10.1093/nar/gkac1085',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://drugcentral.org/'],\n",
+ " 'dbUrl': 'https://drugcentral.org?q=%u',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2982, 'unreviewedProteinCount': 288}},\n",
+ " {'name': 'The Eukaryotic Linear Motif resource for Functional Sites in Proteins',\n",
+ " 'id': 'DB-0223',\n",
+ " 'abbrev': 'ELM',\n",
+ " 'pubMedId': '34718738',\n",
+ " 'doiId': '10.1093/nar/gkab975',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://elm.eu.org/'],\n",
+ " 'dbUrl': 'http://elm.eu.org/instances.html?q=%u',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1815, 'unreviewedProteinCount': 76}},\n",
+ " {'name': 'EMBL nucleotide sequence database',\n",
+ " 'id': 'DB-0022',\n",
+ " 'abbrev': 'EMBL',\n",
+ " 'pubMedId': '33175160',\n",
+ " 'doiId': '10.1093/nar/gkaa1028',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/ena/browser/home'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/ena/browser/view/%s',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 561609,\n",
+ " 'unreviewedProteinCount': 185502674}},\n",
+ " {'name': 'Electron Microscopy Data Bank',\n",
+ " 'id': 'DB-0272',\n",
+ " 'abbrev': 'EMDB',\n",
+ " 'pubMedId': '26578576',\n",
+ " 'doiId': '10.1093/nar/gkv1126',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/emdb'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/emdb/%s',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 11682,\n",
+ " 'unreviewedProteinCount': 9896}},\n",
+ " {'name': 'Enzyme nomenclature database',\n",
+ " 'id': 'DB-0024',\n",
+ " 'abbrev': 'ENZYME',\n",
+ " 'pubMedId': '10592255',\n",
+ " 'doiId': '10.1093/nar/28.1.304',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://enzyme.expasy.org/'],\n",
+ " 'dbUrl': 'https://enzyme.expasy.org/EC/%s',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'ESTHER database of the Alpha/Beta-hydrolase fold superfamily of proteins',\n",
+ " 'id': 'DB-0193',\n",
+ " 'abbrev': 'ESTHER',\n",
+ " 'pubMedId': '23193256',\n",
+ " 'doiId': '10.1093/nar/gks1154',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://bioweb.supagro.inra.fr/ESTHER/general?what=index'],\n",
+ " 'dbUrl': 'https://bioweb.supagro.inra.fr/ESTHER/gene_locus?name=%s&class=Gene_locus',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 3032,\n",
+ " 'unreviewedProteinCount': 68523}},\n",
+ " {'name': 'EchoBASE - an integrated post-genomic database for E. coli',\n",
+ " 'id': 'DB-0020',\n",
+ " 'abbrev': 'EchoBASE',\n",
+ " 'pubMedId': '15608209',\n",
+ " 'doiId': '10.1093/nar/gki028',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.york.ac.uk/res/thomas/'],\n",
+ " 'dbUrl': 'https://www.york.ac.uk/res/thomas/Gene.cfm?recordID=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 4158, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Ensembl eukaryotic genome annotation project',\n",
+ " 'id': 'DB-0023',\n",
+ " 'abbrev': 'Ensembl',\n",
+ " 'pubMedId': '31691826',\n",
+ " 'doiId': '10.1093/nar/gkz966',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensembl.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 51864,\n",
+ " 'unreviewedProteinCount': 9071760}},\n",
+ " {'name': 'Ensembl bacterial and archaeal genome annotation project',\n",
+ " 'id': 'DB-0147',\n",
+ " 'abbrev': 'EnsemblBacteria',\n",
+ " 'pubMedId': '31598706',\n",
+ " 'doiId': '10.1093/nar/gkz890',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://bacteria.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 55400,\n",
+ " 'unreviewedProteinCount': 257504}},\n",
+ " {'name': 'Ensembl fungal genome annotation project',\n",
+ " 'id': 'DB-0148',\n",
+ " 'abbrev': 'EnsemblFungi',\n",
+ " 'pubMedId': '31598706',\n",
+ " 'doiId': '10.1093/nar/gkz890',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://fungi.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 19196,\n",
+ " 'unreviewedProteinCount': 85337}},\n",
+ " {'name': 'Ensembl metazoan genome annotation project',\n",
+ " 'id': 'DB-0149',\n",
+ " 'abbrev': 'EnsemblMetazoa',\n",
+ " 'pubMedId': '31598706',\n",
+ " 'doiId': '10.1093/nar/gkz890',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://metazoa.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 12907,\n",
+ " 'unreviewedProteinCount': 1190734}},\n",
+ " {'name': 'Ensembl plant genome annotation project',\n",
+ " 'id': 'DB-0150',\n",
+ " 'abbrev': 'EnsemblPlants',\n",
+ " 'pubMedId': '31598706',\n",
+ " 'doiId': '10.1093/nar/gkz890',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://plants.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 6359,\n",
+ " 'unreviewedProteinCount': 1642695}},\n",
+ " {'name': 'Ensembl protists genome annotation project',\n",
+ " 'id': 'DB-0151',\n",
+ " 'abbrev': 'EnsemblProtists',\n",
+ " 'pubMedId': '31598706',\n",
+ " 'doiId': '10.1093/nar/gkz890',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://protists.ensembl.org/'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1593,\n",
+ " 'unreviewedProteinCount': 104680}},\n",
+ " {'name': 'Relative evolutionary importance of amino acids within a protein sequence',\n",
+ " 'id': 'DB-0168',\n",
+ " 'abbrev': 'EvolutionaryTrace',\n",
+ " 'pubMedId': '22183528',\n",
+ " 'doiId': '10.1007/978-1-61779-465-0_3',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://evolution.lichtargelab.org/ETviewer/'],\n",
+ " 'dbUrl': 'https://evolution.lichtargelab.org/cgi-bin/report_maker_ls/uniprotTraceServerResults.pl?identifier=%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 22703,\n",
+ " 'unreviewedProteinCount': 10447}},\n",
+ " {'name': 'ExpressionAtlas, Differential and Baseline Expression',\n",
+ " 'id': 'DB-0004',\n",
+ " 'abbrev': 'ExpressionAtlas',\n",
+ " 'pubMedId': '31665515',\n",
+ " 'doiId': '10.1093/nar/gkz947',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/gxa'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/gxa/query?geneQuery=%u',\n",
+ " 'category': 'Gene expression databases',\n",
+ " 'statistics': {'reviewedProteinCount': 51312,\n",
+ " 'unreviewedProteinCount': 730515}},\n",
+ " {'name': 'Drosophila genome database',\n",
+ " 'id': 'DB-0026',\n",
+ " 'abbrev': 'FlyBase',\n",
+ " 'pubMedId': '30364959',\n",
+ " 'doiId': '10.1093/nar/gky1003',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://flybase.org/'],\n",
+ " 'dbUrl': 'https://flybase.org/reports/%s.html',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 3888,\n",
+ " 'unreviewedProteinCount': 24072}},\n",
+ " {'name': 'FunCoup',\n",
+ " 'id': 'DB-0279',\n",
+ " 'abbrev': 'FunCoup',\n",
+ " 'pubMedId': '39530220',\n",
+ " 'doiId': '10.1093/nar/gkae1021',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://funcoup.org/'],\n",
+ " 'dbUrl': 'https://funcoup.org/uniprot/%u/',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 143520,\n",
+ " 'unreviewedProteinCount': 2469592}},\n",
+ " {'name': 'CATH Functional Families',\n",
+ " 'id': 'DB-0274',\n",
+ " 'abbrev': 'FunFam',\n",
+ " 'pubMedId': '26139634',\n",
+ " 'doiId': '10.1093/bioinformatics/btv398',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.cathdb.info/'],\n",
+ " 'dbUrl': 'https://www.cathdb.info/version/latest/funfam/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 327653,\n",
+ " 'unreviewedProteinCount': 43822294}},\n",
+ " {'name': 'Gene Ontology',\n",
+ " 'id': 'DB-0037',\n",
+ " 'abbrev': 'GO',\n",
+ " 'pubMedId': '36866529',\n",
+ " 'doiId': '10.1093/genetics/iyad031',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://geneontology.org/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/QuickGO/term/%s',\n",
+ " 'category': 'Ontologies',\n",
+ " 'statistics': {'reviewedProteinCount': 553982,\n",
+ " 'unreviewedProteinCount': 121180446}},\n",
+ " {'name': 'Information system for G protein-coupled receptors (GPCRs)',\n",
+ " 'id': 'DB-0038',\n",
+ " 'abbrev': 'GPCRDB',\n",
+ " 'pubMedId': '39558158',\n",
+ " 'doiId': '10.1093/nar/gkae1065',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://gpcrdb.org'],\n",
+ " 'dbUrl': 'https://gpcrdb.org/protein/%u/',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'GenAtlas',\n",
+ " 'id': 'DB-0027',\n",
+ " 'abbrev': 'GenAtlas',\n",
+ " 'pubMedId': '9835018',\n",
+ " 'doiId': '10.1016/S0764-4469(99)80021-3',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['http://genatlas.medecine.univ-paris5.fr/'],\n",
+ " 'dbUrl': 'http://genatlas.medecine.univ-paris5.fr/fiche.php?symbol=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'GenBank nucleotide sequence database',\n",
+ " 'id': 'DB-0028',\n",
+ " 'abbrev': 'GenBank',\n",
+ " 'pubMedId': '31665464',\n",
+ " 'doiId': '10.1093/nar/gkz956',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/nuccore/%s',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'GenCC',\n",
+ " 'id': 'DB-0264',\n",
+ " 'abbrev': 'GenCC',\n",
+ " 'pubMedId': '35507016',\n",
+ " 'doiId': '10.1016/j.gim.2022.04.017',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://thegencc.org/'],\n",
+ " 'dbUrl': 'https://search.thegencc.org/genes/%s',\n",
+ " 'category': 'Genetic variation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Gene3D Structural and Functional Annotation of Protein Families',\n",
+ " 'id': 'DB-0029',\n",
+ " 'abbrev': 'Gene3D',\n",
+ " 'pubMedId': '29112716',\n",
+ " 'doiId': '10.1093/nar/gkx1069',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.cathdb.info/'],\n",
+ " 'dbUrl': 'http://www.cathdb.info/superfamily/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 478318,\n",
+ " 'unreviewedProteinCount': 131122919}},\n",
+ " {'name': 'GeneCards',\n",
+ " 'id': 'DB-0030',\n",
+ " 'abbrev': 'GeneCards',\n",
+ " 'pubMedId': '27322403',\n",
+ " 'doiId': '10.1002/cpbi.5',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.genecards.org/'],\n",
+ " 'dbUrl': 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 20247,\n",
+ " 'unreviewedProteinCount': 1402}},\n",
+ " {'name': 'Database of genes from NCBI RefSeq genomes',\n",
+ " 'id': 'DB-0118',\n",
+ " 'abbrev': 'GeneID',\n",
+ " 'pubMedId': '25355515',\n",
+ " 'doiId': '10.1093/nar/gku1055',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/gene'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/gene/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 288167,\n",
+ " 'unreviewedProteinCount': 23312158}},\n",
+ " {'name': 'GeneReviews a resource of expert-authored, peer-reviewed disease descriptions.',\n",
+ " 'id': 'DB-0188',\n",
+ " 'abbrev': 'GeneReviews',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/books/NBK1116'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/books/NBK1116/?term=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1619, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Ensembl GeneTree',\n",
+ " 'id': 'DB-0162',\n",
+ " 'abbrev': 'GeneTree',\n",
+ " 'pubMedId': '26896847',\n",
+ " 'doiId': '10.1093/database/bav096',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://ensemblgenomes.org'],\n",
+ " 'dbUrl': 'https://www.ensemblgenomes.org/id-genetree/%s',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 48921,\n",
+ " 'unreviewedProteinCount': 6302199}},\n",
+ " {'name': 'The Gene Wiki collection of pages on human genes and proteins',\n",
+ " 'id': 'DB-0180',\n",
+ " 'abbrev': 'GeneWiki',\n",
+ " 'pubMedId': '22075991',\n",
+ " 'doiId': '10.1093/nar/gkr925',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://en.wikipedia.org/wiki/Portal:Gene_Wiki'],\n",
+ " 'dbUrl': 'https://en.wikipedia.org/wiki/%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 10269, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Database of phenotypes from RNA interference screens in Drosophila and Homo sapiens',\n",
+ " 'id': 'DB-0169',\n",
+ " 'abbrev': 'GenomeRNAi',\n",
+ " 'pubMedId': '23193271',\n",
+ " 'doiId': '10.1093/nar/gks1170',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://genomernai.dkfz.de/'],\n",
+ " 'dbUrl': 'http://genomernai.org/genedetails/%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 22326, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'GlyConnect protein glycosylation platform',\n",
+ " 'id': 'DB-0227',\n",
+ " 'abbrev': 'GlyConnect',\n",
+ " 'pubMedId': '30574787',\n",
+ " 'doiId': '10.1021/acs.jproteome.8b00766',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://glyconnect.expasy.org'],\n",
+ " 'dbUrl': 'https://glyconnect.expasy.org/browser/proteins/%s',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2215, 'unreviewedProteinCount': 43}},\n",
+ " {'name': 'GlyCosmos Portal integrating glycosciences with life sciences',\n",
+ " 'id': 'DB-0267',\n",
+ " 'abbrev': 'GlyCosmos',\n",
+ " 'pubMedId': '32572234',\n",
+ " 'doiId': '10.1038/s41592-020-0879-8',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://glycosmos.org/'],\n",
+ " 'dbUrl': 'https://glycosmos.org/glycoproteins/show/uniprot/%u',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 28908,\n",
+ " 'unreviewedProteinCount': 37516}},\n",
+ " {'name': 'GlyGen',\n",
+ " 'id': 'DB-0254',\n",
+ " 'abbrev': 'GlyGen',\n",
+ " 'pubMedId': '31616925',\n",
+ " 'doiId': '10.1093/glycob/cwz080',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://glygen.org'],\n",
+ " 'dbUrl': 'https://glygen.org/protein/%u#glycosylation',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 39091,\n",
+ " 'unreviewedProteinCount': 24238}},\n",
+ " {'name': 'Gramene; a comparative resource for plants',\n",
+ " 'id': 'DB-0039',\n",
+ " 'abbrev': 'Gramene',\n",
+ " 'pubMedId': '35037202',\n",
+ " 'doiId': '10.1007/978-1-0716-2067-0_5',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.gramene.org/'],\n",
+ " 'dbUrl': 'https://ensembl.gramene.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 22301,\n",
+ " 'unreviewedProteinCount': 3785988}},\n",
+ " {'name': 'IUPHAR/BPS Guide to PHARMACOLOGY',\n",
+ " 'id': 'DB-0182',\n",
+ " 'abbrev': 'GuidetoPHARMACOLOGY',\n",
+ " 'pubMedId': '34718737',\n",
+ " 'doiId': '10.1093/nar/gkab1010',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.guidetopharmacology.org'],\n",
+ " 'dbUrl': 'https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=%s',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2299, 'unreviewedProteinCount': 18}},\n",
+ " {'name': 'HAMAP database of protein families',\n",
+ " 'id': 'DB-0041',\n",
+ " 'abbrev': 'HAMAP',\n",
+ " 'pubMedId': '25348399',\n",
+ " 'doiId': '10.1093/nar/gku1002',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://hamap.expasy.org/'],\n",
+ " 'dbUrl': 'https://hamap.expasy.org/signature/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 328099,\n",
+ " 'unreviewedProteinCount': 17907362}},\n",
+ " {'name': 'Human Gene Nomenclature Database',\n",
+ " 'id': 'DB-0042',\n",
+ " 'abbrev': 'HGNC',\n",
+ " 'pubMedId': '33152070',\n",
+ " 'doiId': '10.1093/nar/gkaa980',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.genenames.org/'],\n",
+ " 'dbUrl': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 20256,\n",
+ " 'unreviewedProteinCount': 62135}},\n",
+ " {'name': 'The HOGENOM Database of Homologous Genes from Fully Sequenced Organisms',\n",
+ " 'id': 'DB-0044',\n",
+ " 'abbrev': 'HOGENOM',\n",
+ " 'pubMedId': '19534752',\n",
+ " 'doiId': '10.1186/1471-2105-10-S6-S3',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://hogenom.univ-lyon1.fr/'],\n",
+ " 'dbUrl': 'http://hogenom.univ-lyon1.fr/query_sequence?seq=%u',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 428629,\n",
+ " 'unreviewedProteinCount': 12712505}},\n",
+ " {'name': 'Human Protein Atlas',\n",
+ " 'id': 'DB-0046',\n",
+ " 'abbrev': 'HPA',\n",
+ " 'pubMedId': '25613900',\n",
+ " 'doiId': '10.1126/science.1260419',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.proteinatlas.org/'],\n",
+ " 'dbUrl': 'https://www.proteinatlas.org/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 19215, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Human Unidentified Gene-Encoded large proteins database',\n",
+ " 'id': 'DB-0049',\n",
+ " 'abbrev': 'HUGE',\n",
+ " 'pubMedId': '14681467',\n",
+ " 'doiId': '10.1093/nar/gkh035',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['http://www.kazusa.or.jp/huge/'],\n",
+ " 'dbUrl': 'http://www.kazusa.or.jp/huge/gfpage/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Intrinsically Disordered proteins with Extensive Annotations and Literature',\n",
+ " 'id': 'DB-0251',\n",
+ " 'abbrev': 'IDEAL',\n",
+ " 'pubMedId': '24178034',\n",
+ " 'doiId': '10.1093/nar/gkt1010',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ideal-db.org/'],\n",
+ " 'dbUrl': 'https://www.ideal-db.org/ideal.php?id=%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1101, 'unreviewedProteinCount': 9}},\n",
+ " {'name': 'The international ImMunoGeneTics information system',\n",
+ " 'id': 'DB-0050',\n",
+ " 'abbrev': 'IMGT_GENE-DB',\n",
+ " 'pubMedId': '25378316',\n",
+ " 'doiId': '10.1093/nar/gku1056',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.imgt.org/genedb'],\n",
+ " 'dbUrl': 'https://www.imgt.org/IMGT_GENE-DB/GENElect?query=2+%s&species=Homo+sapiens',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 267, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'InParanoid',\n",
+ " 'id': 'DB-0146',\n",
+ " 'abbrev': 'InParanoid',\n",
+ " 'pubMedId': '36764355',\n",
+ " 'doiId': '10.1016/j.jmb.2023.168001',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://inparanoidb.sbc.su.se/'],\n",
+ " 'dbUrl': 'https://inparanoidb.sbc.su.se/orthologs/%u&1/',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 164709,\n",
+ " 'unreviewedProteinCount': 6521465}},\n",
+ " {'name': 'Protein interaction database and analysis system',\n",
+ " 'id': 'DB-0051',\n",
+ " 'abbrev': 'IntAct',\n",
+ " 'pubMedId': '24234451',\n",
+ " 'doiId': '10.1093/nar/gkt1115',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/intact/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/intact/interactors/id:%s*',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 57960,\n",
+ " 'unreviewedProteinCount': 46366}},\n",
+ " {'name': 'Integrated resource of protein families, domains and functional sites',\n",
+ " 'id': 'DB-0052',\n",
+ " 'abbrev': 'InterPro',\n",
+ " 'pubMedId': '36350672',\n",
+ " 'doiId': '10.1093/nar/gkac993',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/interpro/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 556937,\n",
+ " 'unreviewedProteinCount': 162864033}},\n",
+ " {'name': 'Schizosaccharomyces japonicus model organism database',\n",
+ " 'id': 'DB-0273',\n",
+ " 'abbrev': 'JaponicusDB',\n",
+ " 'pubMedId': '35380656',\n",
+ " 'doiId': '10.1093/genetics/iyab223',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.japonicusdb.org'],\n",
+ " 'dbUrl': 'https://www.japonicusdb.org/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 43, 'unreviewedProteinCount': 4798}},\n",
+ " {'name': 'KEGG',\n",
+ " 'id': 'DB-0053',\n",
+ " 'abbrev': 'KEGG',\n",
+ " 'pubMedId': '39417505',\n",
+ " 'doiId': '10.1093/nar/gkae909',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.genome.jp/kegg/'],\n",
+ " 'dbUrl': 'https://www.genome.jp/dbget-bin/www_bget?%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 484511,\n",
+ " 'unreviewedProteinCount': 22466947}},\n",
+ " {'name': 'Legionella pneumophila genome database',\n",
+ " 'id': 'DB-0054',\n",
+ " 'abbrev': 'LegioList',\n",
+ " 'pubMedId': '18032431',\n",
+ " 'doiId': '10.1093/nar/gkm1042',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://genolist.pasteur.fr/LegioList/'],\n",
+ " 'dbUrl': 'http://genolist.pasteur.fr/LegioList/genome.cgi?external_query+%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 763, 'unreviewedProteinCount': 2483}},\n",
+ " {'name': 'Mycobacterium leprae genome database',\n",
+ " 'id': 'DB-0055',\n",
+ " 'abbrev': 'Leproma',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://mycobrowser.epfl.ch/'],\n",
+ " 'dbUrl': 'https://mycobrowser.epfl.ch/genes/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 669, 'unreviewedProteinCount': 1269}},\n",
+ " {'name': 'Matched Annotation from NCBI and EMBL-EBI (MANE) - Phase one',\n",
+ " 'id': 'DB-0261',\n",
+ " 'abbrev': 'MANE-Select',\n",
+ " 'pubMedId': '35388217',\n",
+ " 'doiId': '10.1038/s41586-022-04558-8',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ensembl.org/info/genome/genebuild/mane.html',\n",
+ " 'https://www.ncbi.nlm.nih.gov/refseq/MANE/'],\n",
+ " 'dbUrl': 'https://www.ensembl.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18480,\n",
+ " 'unreviewedProteinCount': 692}},\n",
+ " {'name': 'MEROPS protease database',\n",
+ " 'id': 'DB-0059',\n",
+ " 'abbrev': 'MEROPS',\n",
+ " 'pubMedId': '29145643',\n",
+ " 'doiId': '10.1093/nar/gkx1134',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/merops/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/merops/cgi-bin/pepsum?id=%s',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 13845,\n",
+ " 'unreviewedProteinCount': 251607}},\n",
+ " {'name': 'Mouse genome database (MGD) from Mouse Genome Informatics (MGI)',\n",
+ " 'id': 'DB-0060',\n",
+ " 'abbrev': 'MGI',\n",
+ " 'pubMedId': '38531069',\n",
+ " 'doiId': '10.1093/genetics/iyae031',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.informatics.jax.org/'],\n",
+ " 'dbUrl': 'https://www.informatics.jax.org/marker/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 17125,\n",
+ " 'unreviewedProteinCount': 62472}},\n",
+ " {'name': 'Online Mendelian Inheritance in Man (OMIM)',\n",
+ " 'id': 'DB-0062',\n",
+ " 'abbrev': 'MIM',\n",
+ " 'pubMedId': '30445645',\n",
+ " 'doiId': '10.1093/nar/gky1151',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.omim.org/'],\n",
+ " 'dbUrl': 'https://www.omim.org/entry/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 16578, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Molecular INTeraction database',\n",
+ " 'id': 'DB-0158',\n",
+ " 'abbrev': 'MINT',\n",
+ " 'pubMedId': '22096227',\n",
+ " 'doiId': '10.1093/nar/gkr930',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://mint.bio.uniroma2.it/'],\n",
+ " 'dbUrl': 'https://mint.bio.uniroma2.it/cgi-bin/protein.py?id=%u',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 24151,\n",
+ " 'unreviewedProteinCount': 3651}},\n",
+ " {'name': 'Maize Genetics and Genomics Database',\n",
+ " 'id': 'DB-0058',\n",
+ " 'abbrev': 'MaizeGDB',\n",
+ " 'pubMedId': '34416864',\n",
+ " 'doiId': '10.1186/s12870-021-03173-5',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.maizegdb.org/'],\n",
+ " 'dbUrl': 'https://www.maizegdb.org/data_center/gene_product?id=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 525, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'MalaCards human disease database',\n",
+ " 'id': 'DB-0196',\n",
+ " 'abbrev': 'MalaCards',\n",
+ " 'pubMedId': '27899610',\n",
+ " 'doiId': '10.1093/nar/gkw1012',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.malacards.org'],\n",
+ " 'dbUrl': 'https://www.malacards.org/search/eliteGene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 7383, 'unreviewedProteinCount': 105}},\n",
+ " {'name': 'MassIVE - Mass Spectrometry Interactive Virtual Environment',\n",
+ " 'id': 'DB-0241',\n",
+ " 'abbrev': 'MassIVE',\n",
+ " 'pubMedId': '30172843',\n",
+ " 'doiId': '10.1016/j.cels.2018.08.004',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://massive.ucsd.edu/'],\n",
+ " 'dbUrl': 'https://massive.ucsd.edu/ProteoSAFe/protein_explorer.jsp?libraries=2&protein_name=%u',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 19140,\n",
+ " 'unreviewedProteinCount': 41876}},\n",
+ " {'name': 'MetOSite database of methionine sulfoxide sites',\n",
+ " 'id': 'DB-0247',\n",
+ " 'abbrev': 'MetOSite',\n",
+ " 'pubMedId': '31197322',\n",
+ " 'doiId': '10.1093/bioinformatics/btz462',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://metosite.uma.es/'],\n",
+ " 'dbUrl': 'https://metosite.uma.es/scan/%u',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 3455, 'unreviewedProteinCount': 745}},\n",
+ " {'name': 'MobiDB',\n",
+ " 'id': 'DB-0183',\n",
+ " 'abbrev': 'MobiDB',\n",
+ " 'pubMedId': '36416266',\n",
+ " 'doiId': '10.1093/nar/gkac1065',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://mobidb.org'],\n",
+ " 'dbUrl': 'https://mobidb.org/%u',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Database of comparative protein structure models',\n",
+ " 'id': 'DB-0063',\n",
+ " 'abbrev': 'ModBase',\n",
+ " 'pubMedId': '24271400',\n",
+ " 'doiId': '10.1093/nar/gkt1144',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://modbase.compbio.ucsf.edu/'],\n",
+ " 'dbUrl': 'https://salilab.org/modbase-cgi/model_search.cgi?searchkw=name&kword=%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'MoonDB Database of extreme multifunctional and moonlighting proteins',\n",
+ " 'id': 'DB-0230',\n",
+ " 'abbrev': 'MoonDB',\n",
+ " 'pubMedId': '30371819',\n",
+ " 'doiId': '10.1093/nar/gky1039',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://moondb.hb.univ-amu.fr'],\n",
+ " 'dbUrl': 'http://moondb.hb.univ-amu.fr/protein/%u',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 348, 'unreviewedProteinCount': 1}},\n",
+ " {'name': 'MoonProt database of moonlighting proteins',\n",
+ " 'id': 'DB-0189',\n",
+ " 'abbrev': 'MoonProt',\n",
+ " 'pubMedId': '29126295',\n",
+ " 'doiId': '10.1093/nar/gkx1043',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.moonlightingproteins.org/'],\n",
+ " 'dbUrl': 'http://www.moonlightingproteins.org/proteins/?q=%u',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 368, 'unreviewedProteinCount': 59}},\n",
+ " {'name': 'NCBIfam',\n",
+ " 'id': 'DB-0270',\n",
+ " 'abbrev': 'NCBIfam',\n",
+ " 'pubMedId': '33270901',\n",
+ " 'doiId': '10.1093/nar/gkaa1105',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/genome/annotation_prok/evidence/',\n",
+ " 'https://www.ncbi.nlm.nih.gov/refseq/annotation_prok/tigrfams/'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/genome/annotation_prok/evidence/%s/',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 347231,\n",
+ " 'unreviewedProteinCount': 40960604}},\n",
+ " {'name': 'NIAGADS Genomics Database',\n",
+ " 'id': 'DB-0237',\n",
+ " 'abbrev': 'NIAGADS',\n",
+ " 'pubMedId': '40545618',\n",
+ " 'doiId': '10.1002/alz.70255',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.niagads.org/genomics/'],\n",
+ " 'dbUrl': 'https://www.niagads.org/genomics/app/record/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 76, 'unreviewedProteinCount': 252}},\n",
+ " {'name': 'USC-OGP 2-DE database',\n",
+ " 'id': 'DB-0067',\n",
+ " 'abbrev': 'OGP',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://usc_ogp_2ddatabase.cesga.es/cgi-bin/2d/2d.cgi'],\n",
+ " 'dbUrl': 'http://usc_ogp_2ddatabase.cesga.es/cgi-bin/2d/2d.cgi?%s',\n",
+ " 'category': '2D gel databases',\n",
+ " 'statistics': {'reviewedProteinCount': 373, 'unreviewedProteinCount': 3}},\n",
+ " {'name': 'Identification of Orthologs from Complete Genome Data',\n",
+ " 'id': 'DB-0137',\n",
+ " 'abbrev': 'OMA',\n",
+ " 'pubMedId': '33174605',\n",
+ " 'doiId': '10.1093/nar/gkaa1007',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://omabrowser.org/oma/home/'],\n",
+ " 'dbUrl': 'https://omabrowser.org/oma/group/%u',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 120955,\n",
+ " 'unreviewedProteinCount': 8091120}},\n",
+ " {'name': 'Open Targets',\n",
+ " 'id': 'DB-0219',\n",
+ " 'abbrev': 'OpenTargets',\n",
+ " 'pubMedId': '36399499',\n",
+ " 'doiId': '10.1093/nar/gkac1046',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://platform.opentargets.org/'],\n",
+ " 'dbUrl': 'https://platform.opentargets.org/target/%s/associations',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18469,\n",
+ " 'unreviewedProteinCount': 59660}},\n",
+ " {'name': 'Orphanet; a database dedicated to information on rare diseases and orphan drugs',\n",
+ " 'id': 'DB-0068',\n",
+ " 'abbrev': 'Orphanet',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.orpha.net'],\n",
+ " 'dbUrl': 'https://www.orpha.net/en/disease/detail/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 4441, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Database of Orthologous Groups',\n",
+ " 'id': 'DB-0143',\n",
+ " 'abbrev': 'OrthoDB',\n",
+ " 'pubMedId': '39535043',\n",
+ " 'doiId': '10.1093/nar/gkae987',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.orthodb.org'],\n",
+ " 'dbUrl': 'https://www.orthodb.org/?gene=%u',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 270718,\n",
+ " 'unreviewedProteinCount': 56844461}},\n",
+ " {'name': 'The PAN-GO gene functionome',\n",
+ " 'id': 'DB-0280',\n",
+ " 'abbrev': 'PAN-GO',\n",
+ " 'pubMedId': '40011791',\n",
+ " 'doiId': '10.1038/s41586-025-08592-0',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://functionome.geneontology.org/'],\n",
+ " 'dbUrl': 'https://functionome.geneontology.org/gene/UniProtKB:%u',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 20210,\n",
+ " 'unreviewedProteinCount': 395}},\n",
+ " {'name': 'The PANTHER Classification System',\n",
+ " 'id': 'DB-0069',\n",
+ " 'abbrev': 'PANTHER',\n",
+ " 'pubMedId': '33290554',\n",
+ " 'doiId': '10.1093/nar/gkaa1106',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.pantherdb.org/'],\n",
+ " 'dbUrl': 'https://www.pantherdb.org/panther/family.do?clsAccession=%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 505734,\n",
+ " 'unreviewedProteinCount': 138423061}},\n",
+ " {'name': 'Pathosystems Resource Integration Center (PATRIC)',\n",
+ " 'id': 'DB-0165',\n",
+ " 'abbrev': 'PATRIC',\n",
+ " 'pubMedId': '31667520',\n",
+ " 'doiId': '10.1093/nar/gkz943',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://patricbrc.org/'],\n",
+ " 'dbUrl': 'https://www.patricbrc.org/view/Feature/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 93326,\n",
+ " 'unreviewedProteinCount': 7297376}},\n",
+ " {'name': 'The Protein Circular Dichroism Data Bank',\n",
+ " 'id': 'DB-0257',\n",
+ " 'abbrev': 'PCDDB',\n",
+ " 'pubMedId': '34999124',\n",
+ " 'doiId': '10.1016/j.jmb.2022.167441',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://pcddb.cryst.bbk.ac.uk/'],\n",
+ " 'dbUrl': 'https://pcddb.cryst.bbk.ac.uk/uniprot/%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 134, 'unreviewedProteinCount': 16}},\n",
+ " {'name': 'Protein Data Bank Europe',\n",
+ " 'id': 'DB-0070',\n",
+ " 'abbrev': 'PDB',\n",
+ " 'pubMedId': '31691821',\n",
+ " 'doiId': '10.1093/nar/gkz990',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/pdbe/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/pdbe-srv/view/entry/%s',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 38379,\n",
+ " 'unreviewedProteinCount': 32500}},\n",
+ " {'name': 'Protein Data Bank in Europe - Knowledge Base',\n",
+ " 'id': 'DB-0244',\n",
+ " 'abbrev': 'PDBe-KB',\n",
+ " 'pubMedId': '31584092',\n",
+ " 'doiId': '10.1093/nar/gkz853',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/pdbe/pdbe-kb/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/pdbe/pdbe-kb/proteins/%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Protein Data Bank Japan',\n",
+ " 'id': 'DB-0172',\n",
+ " 'abbrev': 'PDBj',\n",
+ " 'pubMedId': '34664328',\n",
+ " 'doiId': '10.1002/pro.4211',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://pdbj.org/'],\n",
+ " 'dbUrl': 'https://pdbj.org/mine/summary/%s',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'PDBsum; at-a-glance overview of macromolecular structures',\n",
+ " 'id': 'DB-0119',\n",
+ " 'abbrev': 'PDBsum',\n",
+ " 'pubMedId': '28875543',\n",
+ " 'doiId': '10.1002/pro.3289',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/pdbsum/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/pdbsum/%s',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 38379,\n",
+ " 'unreviewedProteinCount': 18893}},\n",
+ " {'name': 'Pathogen-Host Interaction database',\n",
+ " 'id': 'DB-0248',\n",
+ " 'abbrev': 'PHI-base',\n",
+ " 'pubMedId': '27915230',\n",
+ " 'doiId': '10.1093/nar/gkw1089',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.phi-base.org/'],\n",
+ " 'dbUrl': 'http://www.phi-base.org/searchFacet.htm?queryTerm=%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1949,\n",
+ " 'unreviewedProteinCount': 6141}},\n",
+ " {'name': 'Protein sequence database of the Protein Information Resource',\n",
+ " 'id': 'DB-0078',\n",
+ " 'abbrev': 'PIR',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://proteininformationresource.org/'],\n",
+ " 'dbUrl': 'https://proteininformationresource.org/cgi-bin/nbrfget?uid=%s',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 114939,\n",
+ " 'unreviewedProteinCount': 123055}},\n",
+ " {'name': 'PIRSF; a whole-protein classification database',\n",
+ " 'id': 'DB-0079',\n",
+ " 'abbrev': 'PIRSF',\n",
+ " 'pubMedId': '14681371',\n",
+ " 'doiId': '10.1093/nar/gkh097',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://proteininformationresource.org/pirwww/dbinfo/pirsf.shtml',\n",
+ " 'https://www.ebi.ac.uk/interpro/entry/pirsf/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/pirsf/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 109925,\n",
+ " 'unreviewedProteinCount': 15679593}},\n",
+ " {'name': 'PRoteomics IDEntifications database',\n",
+ " 'id': 'DB-0130',\n",
+ " 'abbrev': 'PRIDE',\n",
+ " 'pubMedId': '30395289',\n",
+ " 'doiId': '10.1093/nar/gky1106',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/pride'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/pride/searchSummary.do?queryTypeSelected=identification%20accession%20number&identificationAccessionNumber=%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 637, 'unreviewedProteinCount': 444}},\n",
+ " {'name': 'Protein Motif fingerprint database; a protein domain database',\n",
+ " 'id': 'DB-0082',\n",
+ " 'abbrev': 'PRINTS',\n",
+ " 'pubMedId': '22508994',\n",
+ " 'doiId': '10.1093/database/bas019',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.bioinf.manchester.ac.uk/dbbrowser/PRINTS/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/prints/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 130114,\n",
+ " 'unreviewedProteinCount': 28453191}},\n",
+ " {'name': 'Protein Ontology',\n",
+ " 'id': 'DB-0181',\n",
+ " 'abbrev': 'PRO',\n",
+ " 'pubMedId': '27899649',\n",
+ " 'doiId': '10.1093/nar/gkw1075',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://proconsortium.org/'],\n",
+ " 'dbUrl': 'https://proconsortium.org/cgi-bin/entry_pro?id=%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 100156,\n",
+ " 'unreviewedProteinCount': 2029}},\n",
+ " {'name': 'PROSITE; a protein domain and family database',\n",
+ " 'id': 'DB-0084',\n",
+ " 'abbrev': 'PROSITE',\n",
+ " 'pubMedId': '23161676',\n",
+ " 'doiId': '10.1093/nar/gks1067',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://prosite.expasy.org/'],\n",
+ " 'dbUrl': 'https://prosite.expasy.org/doc/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 313384,\n",
+ " 'unreviewedProteinCount': 78475457}},\n",
+ " {'name': 'Pathway Commons web resource for biological pathway data',\n",
+ " 'id': 'DB-0253',\n",
+ " 'abbrev': 'PathwayCommons',\n",
+ " 'pubMedId': '31647099',\n",
+ " 'doiId': '10.1093/nar/gkz946/5606621',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.pathwaycommons.org'],\n",
+ " 'dbUrl': 'https://apps.pathwaycommons.org/search?q=%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 19433, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'PaxDb, a database of protein abundance averages across all three domains of life',\n",
+ " 'id': 'DB-0173',\n",
+ " 'abbrev': 'PaxDb',\n",
+ " 'pubMedId': '25656970',\n",
+ " 'doiId': '10.1002/pmic.201400441',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://pax-db.org'],\n",
+ " 'dbUrl': 'https://pax-db.org/uniprot_redirect/%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 154216,\n",
+ " 'unreviewedProteinCount': 1422200}},\n",
+ " {'name': 'PeptideAtlas',\n",
+ " 'id': 'DB-0071',\n",
+ " 'abbrev': 'PeptideAtlas',\n",
+ " 'pubMedId': '16381952',\n",
+ " 'doiId': '10.1093/nar/gkj040',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://peptideatlas.org'],\n",
+ " 'dbUrl': 'https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/Search?action=GO&search_key=%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 38922,\n",
+ " 'unreviewedProteinCount': 113634}},\n",
+ " {'name': 'PeroxiBase, a peroxidase database',\n",
+ " 'id': 'DB-0072',\n",
+ " 'abbrev': 'PeroxiBase',\n",
+ " 'pubMedId': '23180785',\n",
+ " 'doiId': '10.1093/nar/gks1083',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://peroxibase.toulouse.inra.fr/'],\n",
+ " 'dbUrl': 'https://peroxibase.toulouse.inra.fr/display_perox/view_perox/%s',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 773, 'unreviewedProteinCount': 2411}},\n",
+ " {'name': 'Pfam protein domain database',\n",
+ " 'id': 'DB-0073',\n",
+ " 'abbrev': 'Pfam',\n",
+ " 'pubMedId': '33125078',\n",
+ " 'doiId': '10.1093/nar/gkaa913',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ebi.ac.uk/interpro/'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/pfam/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 547272,\n",
+ " 'unreviewedProteinCount': 153461668}},\n",
+ " {'name': 'Pharos NIH Druggable Genome Knowledgebase',\n",
+ " 'id': 'DB-0240',\n",
+ " 'abbrev': 'Pharos',\n",
+ " 'pubMedId': '27903890',\n",
+ " 'doiId': '10.1093/nar/gkw1072',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://pharos.nih.gov'],\n",
+ " 'dbUrl': 'https://pharos.nih.gov/targets/%u',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 20192, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Comprehensive resource for the study of protein post-translational modifications (PTMs) in human, mouse and rat.',\n",
+ " 'id': 'DB-0123',\n",
+ " 'abbrev': 'PhosphoSitePlus',\n",
+ " 'pubMedId': '25514926',\n",
+ " 'doiId': '10.1093/nar/gku1267',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.phosphosite.org'],\n",
+ " 'dbUrl': 'https://www.phosphosite.org/uniprotAccAction?id=%u',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 42259,\n",
+ " 'unreviewedProteinCount': 5928}},\n",
+ " {'name': 'Database for complete collections of gene phylogenies',\n",
+ " 'id': 'DB-0144',\n",
+ " 'abbrev': 'PhylomeDB',\n",
+ " 'pubMedId': '34718760',\n",
+ " 'doiId': '10.1093/nar/gkab966',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://phylomedb.org/'],\n",
+ " 'dbUrl': 'https://phylomedb.org/search_phylome/?seqid=%u',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 115823,\n",
+ " 'unreviewedProteinCount': 572335}},\n",
+ " {'name': 'Reactome - a knowledgebase of biological pathways and processes for plant species',\n",
+ " 'id': 'DB-0243',\n",
+ " 'abbrev': 'PlantReactome',\n",
+ " 'pubMedId': '37986220',\n",
+ " 'doiId': '10.1093/nar/gkad1052',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://plantreactome.gramene.org/index.php?lang=en'],\n",
+ " 'dbUrl': 'https://plantreactome.gramene.org/PathwayBrowser/#/%s&FLG=%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 824, 'unreviewedProteinCount': 925}},\n",
+ " {'name': 'Schizosaccharomyces pombe database',\n",
+ " 'id': 'DB-0031',\n",
+ " 'abbrev': 'PomBase',\n",
+ " 'pubMedId': '38376816',\n",
+ " 'doiId': '10.1093/genetics/iyae007',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.pombase.org/'],\n",
+ " 'dbUrl': 'https://www.pombase.org/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 5131, 'unreviewedProteinCount': 2}},\n",
+ " {'name': 'Protein Mass spectra EXtraction',\n",
+ " 'id': 'DB-0124',\n",
+ " 'abbrev': 'ProMEX',\n",
+ " 'pubMedId': '22685450',\n",
+ " 'doiId': '10.3389/fpls.2012.00125',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://promex.pph.univie.ac.at/promex/'],\n",
+ " 'dbUrl': 'http://promex.pph.univie.ac.at/promex/?ac=%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 489, 'unreviewedProteinCount': 2274}},\n",
+ " {'name': 'Proteomes',\n",
+ " 'id': 'DB-0191',\n",
+ " 'abbrev': 'Proteomes',\n",
+ " 'pubMedId': '22102590',\n",
+ " 'doiId': '10.1093/nar/gkr981',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.uniprot.org/proteomes'],\n",
+ " 'dbUrl': 'https://www.uniprot.org/proteomes/%s',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 450492,\n",
+ " 'unreviewedProteinCount': 178540290}},\n",
+ " {'name': 'ProteomicsDB',\n",
+ " 'id': 'DB-0229',\n",
+ " 'abbrev': 'ProteomicsDB',\n",
+ " 'pubMedId': '31665479',\n",
+ " 'doiId': '10.1093/nar/gkz974',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.proteomicsdb.org/'],\n",
+ " 'dbUrl': 'https://www.proteomicsdb.org/proteomicsdb/#protein/proteinDetails/%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 45501,\n",
+ " 'unreviewedProteinCount': 70260}},\n",
+ " {'name': 'Pseudomonas genome database',\n",
+ " 'id': 'DB-0086',\n",
+ " 'abbrev': 'PseudoCAP',\n",
+ " 'pubMedId': '26578582',\n",
+ " 'doiId': '10.1093/nar/gkv1227',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.pseudomonas.com/'],\n",
+ " 'dbUrl': 'https://www.pseudomonas.com/feature/show?locus_tag=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2054,\n",
+ " 'unreviewedProteinCount': 4085}},\n",
+ " {'name': 'Pumba database of electrophoretic reference migration patterns',\n",
+ " 'id': 'DB-0271',\n",
+ " 'abbrev': 'Pumba',\n",
+ " 'pubMedId': '36581244',\n",
+ " 'doiId': '10.1016/j.jmb.2022.167933',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://pumba.dcsr.unil.ch'],\n",
+ " 'dbUrl': 'https://pumba.dcsr.unil.ch/lanes/%s',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 18203, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Protein Data Bank RCSB',\n",
+ " 'id': 'DB-0171',\n",
+ " 'abbrev': 'RCSB-PDB',\n",
+ " 'pubMedId': '33211854',\n",
+ " 'doiId': '10.1093/nar/gkaa1038',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://www.rcsb.org/'],\n",
+ " 'dbUrl': 'https://www.rcsb.org/structure/%s',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Restriction enzymes and methylases database',\n",
+ " 'id': 'DB-0089',\n",
+ " 'abbrev': 'REBASE',\n",
+ " 'pubMedId': '36318248',\n",
+ " 'doiId': '10.1093/nar/gkac975',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://rebase.neb.com/rebase/rebase.html'],\n",
+ " 'dbUrl': 'https://rebase.neb.com/rebase/enz/%s.html',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 395,\n",
+ " 'unreviewedProteinCount': 69684}},\n",
+ " {'name': 'REPRODUCTION-2DPAGE',\n",
+ " 'id': 'DB-0090',\n",
+ " 'abbrev': 'REPRODUCTION-2DPAGE',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://reprod.njmu.edu.cn/cgi-bin/2d/2d.cgi'],\n",
+ " 'dbUrl': 'http://reprod.njmu.edu.cn/cgi-bin/2d/2d.cgi?%s',\n",
+ " 'category': '2D gel databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1039, 'unreviewedProteinCount': 59}},\n",
+ " {'name': 'Rat Genome Database',\n",
+ " 'id': 'DB-0091',\n",
+ " 'abbrev': 'RGD',\n",
+ " 'pubMedId': '31713623',\n",
+ " 'doiId': '10.1093/nar/gkz1041',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://rgd.mcw.edu/'],\n",
+ " 'dbUrl': 'https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 8158,\n",
+ " 'unreviewedProteinCount': 65945}},\n",
+ " {'name': 'RNAct, Protein-RNA interaction predictions for model organisms.',\n",
+ " 'id': 'DB-0246',\n",
+ " 'abbrev': 'RNAct',\n",
+ " 'pubMedId': '30445601',\n",
+ " 'doiId': '10.1093/nar/gky967',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://rnact.tartaglialab.com/'],\n",
+ " 'dbUrl': 'https://rnact.tartaglialab.com/protein?query=%u',\n",
+ " 'category': 'Miscellaneous databases',\n",
+ " 'statistics': {'reviewedProteinCount': 43120,\n",
+ " 'unreviewedProteinCount': 2439}},\n",
+ " {'name': 'Reactome - a knowledgebase of biological pathways and processes',\n",
+ " 'id': 'DB-0088',\n",
+ " 'abbrev': 'Reactome',\n",
+ " 'pubMedId': '31691815',\n",
+ " 'doiId': '10.1093/nar/gkz1031',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://reactome.org'],\n",
+ " 'dbUrl': 'https://www.reactome.org/PathwayBrowser/#%s&FLG=%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 39443,\n",
+ " 'unreviewedProteinCount': 42886}},\n",
+ " {'name': 'NCBI Reference Sequences',\n",
+ " 'id': 'DB-0117',\n",
+ " 'abbrev': 'RefSeq',\n",
+ " 'pubMedId': '26553804',\n",
+ " 'doiId': '10.1093/nar/gkv1189',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/refseq/'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/protein/%s',\n",
+ " 'category': 'Sequence databases',\n",
+ " 'statistics': {'reviewedProteinCount': 442362,\n",
+ " 'unreviewedProteinCount': 98371180}},\n",
+ " {'name': 'Rodent Unidentified Gene-Encoded large proteins database',\n",
+ " 'id': 'DB-0092',\n",
+ " 'abbrev': 'Rouge',\n",
+ " 'pubMedId': '14681467',\n",
+ " 'doiId': '10.1093/nar/gkh035',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['http://www.kazusa.or.jp/rouge/'],\n",
+ " 'dbUrl': 'http://www.kazusa.or.jp/rouge/gfpage/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'SABIO-RK',\n",
+ " 'id': 'DB-0177',\n",
+ " 'abbrev': 'SABIO-RK',\n",
+ " 'pubMedId': '29092055',\n",
+ " 'doiId': '10.1093/nar/gkx1065',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://sabiork.h-its.org/'],\n",
+ " 'dbUrl': 'https://sabiork.h-its.org/newSearch?q=UniProtKB_AC:%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 5951, 'unreviewedProteinCount': 833}},\n",
+ " {'name': 'Small Angle Scattering Biological Data Bank',\n",
+ " 'id': 'DB-0258',\n",
+ " 'abbrev': 'SASBDB',\n",
+ " 'pubMedId': '31576635',\n",
+ " 'doiId': '10.1002/pro.3731',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.sasbdb.org/'],\n",
+ " 'dbUrl': 'https://www.sasbdb.org/uniprot/%u/',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1025, 'unreviewedProteinCount': 321}},\n",
+ " {'name': 'Structure-Function Linkage Database',\n",
+ " 'id': 'DB-0220',\n",
+ " 'abbrev': 'SFLD',\n",
+ " 'pubMedId': '24271399',\n",
+ " 'doiId': '10.1093/nar/gkt1130',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://sfld.rbvi.ucsf.edu/archive/django/index.html',\n",
+ " 'https://www.ebi.ac.uk/interpro/entry/sfld/#table'],\n",
+ " 'dbUrl': 'https://www.ebi.ac.uk/interpro/entry/sfld/%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 9138,\n",
+ " 'unreviewedProteinCount': 1695453}},\n",
+ " {'name': 'Saccharomyces Genome Database',\n",
+ " 'id': 'DB-0095',\n",
+ " 'abbrev': 'SGD',\n",
+ " 'pubMedId': '39530598',\n",
+ " 'doiId': '10.1093/genetics/iyae185',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.yeastgenome.org/'],\n",
+ " 'dbUrl': 'https://www.yeastgenome.org/locus/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 6748, 'unreviewedProteinCount': 4}},\n",
+ " {'name': 'SIGNOR Signaling Network Open Resource',\n",
+ " 'id': 'DB-0206',\n",
+ " 'abbrev': 'SIGNOR',\n",
+ " 'pubMedId': '36243968',\n",
+ " 'doiId': '10.1093/nar/gkac883',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://signor.uniroma2.it/'],\n",
+ " 'dbUrl': 'https://signor.uniroma2.it/relation_result.php?id=%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 7769, 'unreviewedProteinCount': 5}},\n",
+ " {'name': 'Simple Modular Architecture Research Tool; a protein domain database',\n",
+ " 'id': 'DB-0097',\n",
+ " 'abbrev': 'SMART',\n",
+ " 'pubMedId': '41062452',\n",
+ " 'doiId': '10.1093/nar/gkaf1023',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://smart.embl.de/'],\n",
+ " 'dbUrl': 'https://smart.embl.de/smart/do_annotation.pl?DOMAIN=%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 149377,\n",
+ " 'unreviewedProteinCount': 44304299}},\n",
+ " {'name': 'SWISS-MODEL Repository - a database of annotated 3D protein structure models',\n",
+ " 'id': 'DB-0098',\n",
+ " 'abbrev': 'SMR',\n",
+ " 'pubMedId': '27899672',\n",
+ " 'doiId': '10.1093/nar/gkw1132',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://swissmodel.expasy.org/repository/'],\n",
+ " 'dbUrl': 'https://swissmodel.expasy.org/repository/uniprot/%s?csm=%d',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 525917,\n",
+ " 'unreviewedProteinCount': 3789969}},\n",
+ " {'name': 'STRENDA database of Standards for Reporting Enzymology Data',\n",
+ " 'id': 'DB-0278',\n",
+ " 'abbrev': 'STRENDA-DB',\n",
+ " 'pubMedId': '29498804',\n",
+ " 'doiId': '10.1111/febs.14427',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.beilstein-strenda-db.org/strenda/index.xhtml'],\n",
+ " 'dbUrl': 'https://beilstein-strenda-db.org/strenda/public/doiQuery.xhtml?doi=10.22011/strenda_db.%s',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 45, 'unreviewedProteinCount': 9}},\n",
+ " {'name': 'STRING',\n",
+ " 'id': 'DB-0141',\n",
+ " 'abbrev': 'STRING',\n",
+ " 'pubMedId': '36370105',\n",
+ " 'doiId': '10.1093/nar/gkac1000',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://string-db.org/'],\n",
+ " 'dbUrl': 'https://string-db.org/network/%s',\n",
+ " 'category': 'Protein-protein interaction databases',\n",
+ " 'statistics': {'reviewedProteinCount': 337189,\n",
+ " 'unreviewedProteinCount': 27452324}},\n",
+ " {'name': 'Superfamily database of structural and functional annotation',\n",
+ " 'id': 'DB-0155',\n",
+ " 'abbrev': 'SUPFAM',\n",
+ " 'pubMedId': '30445555',\n",
+ " 'doiId': '10.1093/nar/gky1130',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://supfam.org'],\n",
+ " 'dbUrl': 'https://supfam.org/SUPERFAMILY/cgi-bin/scop.cgi?ipid=%s',\n",
+ " 'category': 'Family and domain databases',\n",
+ " 'statistics': {'reviewedProteinCount': 461938,\n",
+ " 'unreviewedProteinCount': 124386470}},\n",
+ " {'name': 'SWISS-MODEL Interactive Workspace',\n",
+ " 'id': 'DB-0234',\n",
+ " 'abbrev': 'SWISS-MODEL-Workspace',\n",
+ " 'pubMedId': '29788355',\n",
+ " 'doiId': '10.1093/nar/gky427',\n",
+ " 'linkType': 'Implicit',\n",
+ " 'servers': ['https://swissmodel.expasy.org/'],\n",
+ " 'dbUrl': 'https://swissmodel.expasy.org/interactive/?ac=%u',\n",
+ " 'category': '3D structure databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'SignaLink',\n",
+ " 'id': 'DB-0179',\n",
+ " 'abbrev': 'SignaLink',\n",
+ " 'pubMedId': '34634810',\n",
+ " 'doiId': '10.1093/nar/gkab909',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://signalink.org/'],\n",
+ " 'dbUrl': 'http://signalink.org/node/%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 19948, 'unreviewedProteinCount': 29}},\n",
+ " {'name': 'SwissLipids knowledge resource for lipid biology',\n",
+ " 'id': 'DB-0197',\n",
+ " 'abbrev': 'SwissLipids',\n",
+ " 'pubMedId': '25943471',\n",
+ " 'doiId': '10.1093/bioinformatics/btv285',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.swisslipids.org'],\n",
+ " 'dbUrl': 'https://www.swisslipids.org/#/entity/%s/',\n",
+ " 'category': 'Chemistry databases',\n",
+ " 'statistics': {'reviewedProteinCount': 1394, 'unreviewedProteinCount': 4}},\n",
+ " {'name': 'SwissPalm database of S-palmitoylation events',\n",
+ " 'id': 'DB-0201',\n",
+ " 'abbrev': 'SwissPalm',\n",
+ " 'pubMedId': '26339475',\n",
+ " 'doiId': '10.12688/f1000research.6464.1',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://swisspalm.org'],\n",
+ " 'dbUrl': 'https://swisspalm.org/proteins/%u',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 14331,\n",
+ " 'unreviewedProteinCount': 7170}},\n",
+ " {'name': 'The Arabidopsis Information Resource',\n",
+ " 'id': 'DB-0102',\n",
+ " 'abbrev': 'TAIR',\n",
+ " 'pubMedId': '26201819',\n",
+ " 'doiId': '10.1002/dvg.22877',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.arabidopsis.org/'],\n",
+ " 'dbUrl': 'https://www.arabidopsis.org/locus?name=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 16342,\n",
+ " 'unreviewedProteinCount': 26470}},\n",
+ " {'name': 'Transport Classification Database',\n",
+ " 'id': 'DB-0135',\n",
+ " 'abbrev': 'TCDB',\n",
+ " 'pubMedId': '33170213',\n",
+ " 'doiId': '10.1093/nar/gkaa1004',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.tcdb.org/'],\n",
+ " 'dbUrl': 'https://www.tcdb.org/search/result.php?tc=%s',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 8723,\n",
+ " 'unreviewedProteinCount': 8303}},\n",
+ " {'name': 'Consortium for Top Down Proteomics',\n",
+ " 'id': 'DB-0204',\n",
+ " 'abbrev': 'TopDownProteomics',\n",
+ " 'pubMedId': '24644084',\n",
+ " 'doiId': '10.1002/pmic.201300438',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://repository.topdownproteomics.org/'],\n",
+ " 'dbUrl': 'http://repository.topdownproteomics.org/Proteoforms?query=%u',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2957, 'unreviewedProteinCount': 265}},\n",
+ " {'name': 'Mycobacterium tuberculosis strain H37Rv genome database',\n",
+ " 'id': 'DB-0106',\n",
+ " 'abbrev': 'TubercuList',\n",
+ " 'pubMedId': '20980200',\n",
+ " 'doiId': '10.1016/j.tube.2010.09.006',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://mycobrowser.epfl.ch/'],\n",
+ " 'dbUrl': 'https://mycobrowser.epfl.ch/genes/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 2323, 'unreviewedProteinCount': 904}},\n",
+ " {'name': 'UCSC genome browser',\n",
+ " 'id': 'DB-0139',\n",
+ " 'abbrev': 'UCSC',\n",
+ " 'pubMedId': '39460617',\n",
+ " 'doiId': '10.1093/nar/gkae974',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://genome.ucsc.edu/'],\n",
+ " 'dbUrl': 'https://genome.ucsc.edu/cgi-bin/hgLinkIn?resource=uniprot&id=%u',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 46623,\n",
+ " 'unreviewedProteinCount': 87325}},\n",
+ " {'name': 'UniLectin database of carbohydrate-binding proteins',\n",
+ " 'id': 'DB-0231',\n",
+ " 'abbrev': 'UniLectin',\n",
+ " 'pubMedId': '30239928',\n",
+ " 'doiId': '10.1093/nar/gky832',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://unilectin.unige.ch/'],\n",
+ " 'dbUrl': 'https://unilectin.unige.ch/unilectin3D/display_lectin?uniprot=%u',\n",
+ " 'category': 'Protein family/group databases',\n",
+ " 'statistics': {'reviewedProteinCount': 367, 'unreviewedProteinCount': 232}},\n",
+ " {'name': 'UniPathway',\n",
+ " 'id': 'DB-0170',\n",
+ " 'abbrev': 'UniPathway',\n",
+ " 'pubMedId': '22102589',\n",
+ " 'doiId': '10.1093/nar/gkr1023',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://www.unipathway.org'],\n",
+ " 'dbUrl': 'http://www.unipathway.org?upid=%s&entryac=%u',\n",
+ " 'category': 'Enzyme and pathway databases',\n",
+ " 'statistics': {'reviewedProteinCount': 126895,\n",
+ " 'unreviewedProteinCount': 7933515}},\n",
+ " {'name': 'Eukaryotic Pathogen, Vector and Host Database Resources',\n",
+ " 'id': 'DB-0153',\n",
+ " 'abbrev': 'VEuPathDB',\n",
+ " 'pubMedId': '27903906',\n",
+ " 'doiId': '10.1093/nar/gkw1105',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://veupathdb.org/veupathdb/app'],\n",
+ " 'dbUrl': 'https://www.veupathdb.org/gene/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 80328,\n",
+ " 'unreviewedProteinCount': 5134047}},\n",
+ " {'name': 'Vertebrate Gene Nomenclature Database',\n",
+ " 'id': 'DB-0226',\n",
+ " 'abbrev': 'VGNC',\n",
+ " 'pubMedId': '33152070',\n",
+ " 'doiId': '10.1093/nar/gkaa980',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://vertebrate.genenames.org/'],\n",
+ " 'dbUrl': 'https://vertebrate.genenames.org/data/gene-symbol-report/#!/vgnc_id/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 5143,\n",
+ " 'unreviewedProteinCount': 175413}},\n",
+ " {'name': 'WormBase ParaSite',\n",
+ " 'id': 'DB-0195',\n",
+ " 'abbrev': 'WBParaSite',\n",
+ " 'pubMedId': '26578572',\n",
+ " 'doiId': '10.1093/nar/gkv1217',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://parasite.wormbase.org'],\n",
+ " 'dbUrl': 'https://parasite.wormbase.org/id/%s',\n",
+ " 'category': 'Genome annotation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 54,\n",
+ " 'unreviewedProteinCount': 1385127}},\n",
+ " {'name': 'WormBase',\n",
+ " 'id': 'DB-0110',\n",
+ " 'abbrev': 'WormBase',\n",
+ " 'pubMedId': '31642470',\n",
+ " 'doiId': '10.1093/nar/gkz920',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://wormbase.org/'],\n",
+ " 'dbUrl': 'https://wormbase.org/db/seq/protein?name=%s;class=CDS',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 5108,\n",
+ " 'unreviewedProteinCount': 61562}},\n",
+ " {'name': 'Xenopus laevis and tropicalis biology and genomics resource',\n",
+ " 'id': 'DB-0129',\n",
+ " 'abbrev': 'Xenbase',\n",
+ " 'pubMedId': '36755307',\n",
+ " 'doiId': '10.1093/genetics/iyad018',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.xenbase.org/xenbase/'],\n",
+ " 'dbUrl': 'https://www.xenbase.org/gene/showgene.do?method=display&geneId=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 4758,\n",
+ " 'unreviewedProteinCount': 87300}},\n",
+ " {'name': 'YCharOS',\n",
+ " 'id': 'DB-0276',\n",
+ " 'abbrev': 'YCharOS',\n",
+ " 'pubMedId': '39506148',\n",
+ " 'doiId': '10.1038/d41586-024-03590-0',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://f1000research.com/gateways/ycharos'],\n",
+ " 'dbUrl': 'https://f1000research.com/gateways/ycharos?selectedDomain=&n0=text&o0=&v0=%u',\n",
+ " 'category': 'Protocols and materials databases',\n",
+ " 'statistics': {'reviewedProteinCount': 36, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Zebrafish Information Network genome database',\n",
+ " 'id': 'DB-0113',\n",
+ " 'abbrev': 'ZFIN',\n",
+ " 'pubMedId': '35166825',\n",
+ " 'doiId': '10.1093/genetics/iyac016',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://zfin.org/'],\n",
+ " 'dbUrl': 'https://zfin.org/%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 3295,\n",
+ " 'unreviewedProteinCount': 35408}},\n",
+ " {'name': 'Database of single nucleotide polymorphism',\n",
+ " 'id': 'DB-0013',\n",
+ " 'abbrev': 'dbSNP',\n",
+ " 'pubMedId': '33095870',\n",
+ " 'doiId': '10.1093/nar/gkaa892',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://www.ncbi.nlm.nih.gov/snp/'],\n",
+ " 'dbUrl': 'https://www.ncbi.nlm.nih.gov/snp/%s',\n",
+ " 'category': 'Genetic variation databases',\n",
+ " 'statistics': {'reviewedProteinCount': 0, 'unreviewedProteinCount': 0}},\n",
+ " {'name': 'Dictyostelium discoideum online informatics resource',\n",
+ " 'id': 'DB-0015',\n",
+ " 'abbrev': 'dictyBase',\n",
+ " 'pubMedId': '23172289',\n",
+ " 'doiId': '10.1093/nar/gks1064',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://dictybase.org/'],\n",
+ " 'dbUrl': 'http://dictybase.org/db/cgi-bin/gene_page.pl?primary_id=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 4114,\n",
+ " 'unreviewedProteinCount': 7750}},\n",
+ " {'name': 'evolutionary genealogy of genes',\n",
+ " 'id': 'DB-0152',\n",
+ " 'abbrev': 'eggNOG',\n",
+ " 'pubMedId': '36399505',\n",
+ " 'doiId': '10.1093/nar/gkac1022',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['http://eggnogdb.embl.de/'],\n",
+ " 'dbUrl': 'http://eggnog.embl.de/search/ogs/%s',\n",
+ " 'category': 'Phylogenomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 334622,\n",
+ " 'unreviewedProteinCount': 10426550}},\n",
+ " {'name': 'European Hepatitis C Virus Database',\n",
+ " 'id': 'DB-0025',\n",
+ " 'abbrev': 'euHCVdb',\n",
+ " 'pubMedId': '17142229',\n",
+ " 'doiId': '10.1093/nar/gkl970',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://euhcvdb.lyon.inserm.fr/euHCVdb/jsp/index.jsp'],\n",
+ " 'dbUrl': 'https://euhcvdb.lyon.inserm.fr/euHCVdb/do/displayHCVEntry?primaryAC=%s',\n",
+ " 'category': 'Organism-specific databases',\n",
+ " 'statistics': {'reviewedProteinCount': 44, 'unreviewedProteinCount': 75264}},\n",
+ " {'name': 'iPTMnet integrated resource for PTMs in systems biology context',\n",
+ " 'id': 'DB-0200',\n",
+ " 'abbrev': 'iPTMnet',\n",
+ " 'pubMedId': '29145615',\n",
+ " 'doiId': '10.1093/nar/gkx1104',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://research.bioinformatics.udel.edu/iptmnet/'],\n",
+ " 'dbUrl': 'https://research.bioinformatics.udel.edu/iptmnet/entry/%s',\n",
+ " 'category': 'PTM databases',\n",
+ " 'statistics': {'reviewedProteinCount': 56791,\n",
+ " 'unreviewedProteinCount': 10131}},\n",
+ " {'name': 'jPOST - Japan Proteome Standard Repository/Database',\n",
+ " 'id': 'DB-0233',\n",
+ " 'abbrev': 'jPOST',\n",
+ " 'pubMedId': '27899654',\n",
+ " 'doiId': '10.1093/nar/gkw1080',\n",
+ " 'linkType': 'Explicit',\n",
+ " 'servers': ['https://globe.jpostdb.org/'],\n",
+ " 'dbUrl': 'https://globe.jpostdb.org/protein?id=%u',\n",
+ " 'category': 'Proteomic databases',\n",
+ " 'statistics': {'reviewedProteinCount': 29053,\n",
+ " 'unreviewedProteinCount': 25353}}]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "params = {\"format\": \"json\", \"query\": \"*\", \"size\": 500}\n",
+ "response = requests.get(\"https://rest.uniprot.org/database/search\", params=params, timeout=30)  # bounded wait: never hang the kernel\n",
+ "response.raise_for_status()  # stop here on any HTTP error status instead of parsing an error body\n",
+ "registry_data = response.json()\n",
+ "uniprot_dblist = registry_data[\"results\"]  # list of per-database records (name, id, abbrev, ...)\n",
+ "uniprot_dblist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "e0c21a0d-a1c2-435c-8849-274049ae023d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Official UniProt name count: 185\n",
+ "Official UniProt abbrev count: 185\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'abcd',\n",
+ " 'agora',\n",
+ " 'agr',\n",
+ " 'allergome',\n",
+ " 'alphafolddb',\n",
+ " 'antibodypedia',\n",
+ " 'antifam',\n",
+ " 'arachnoserver',\n",
+ " 'araport',\n",
+ " 'bgee',\n",
+ " 'bindingdb',\n",
+ " 'biocyc',\n",
+ " 'biogrid',\n",
+ " 'biogrid-orcs',\n",
+ " 'biomuta',\n",
+ " 'bmrb',\n",
+ " 'brenda',\n",
+ " 'carbonyldb',\n",
+ " 'card',\n",
+ " 'cazy',\n",
+ " 'ccds',\n",
+ " 'cd-code',\n",
+ " 'cdd',\n",
+ " 'cgd',\n",
+ " 'chembl',\n",
+ " 'chitars',\n",
+ " 'civic',\n",
+ " 'clingen',\n",
+ " 'clinpgx',\n",
+ " 'collectf',\n",
+ " 'complexportal',\n",
+ " 'conoserver',\n",
+ " 'corum',\n",
+ " 'cptac',\n",
+ " 'cptc',\n",
+ " 'ctd',\n",
+ " 'dbsnp',\n",
+ " 'ddbj',\n",
+ " 'depod',\n",
+ " 'dictybase',\n",
+ " 'dip',\n",
+ " 'disgenet',\n",
+ " 'disprot',\n",
+ " 'dmdm',\n",
+ " 'dnasu',\n",
+ " 'drugbank',\n",
+ " 'drugcentral',\n",
+ " 'echobase',\n",
+ " 'eggnog',\n",
+ " 'elm',\n",
+ " 'embl',\n",
+ " 'emdb',\n",
+ " 'ensembl',\n",
+ " 'ensemblbacteria',\n",
+ " 'ensemblfungi',\n",
+ " 'ensemblmetazoa',\n",
+ " 'ensemblplants',\n",
+ " 'ensemblprotists',\n",
+ " 'enzyme',\n",
+ " 'esther',\n",
+ " 'euhcvdb',\n",
+ " 'evolutionarytrace',\n",
+ " 'expressionatlas',\n",
+ " 'flybase',\n",
+ " 'funcoup',\n",
+ " 'funfam',\n",
+ " 'genatlas',\n",
+ " 'genbank',\n",
+ " 'gencc',\n",
+ " 'gene3d',\n",
+ " 'genecards',\n",
+ " 'geneid',\n",
+ " 'genereviews',\n",
+ " 'genetree',\n",
+ " 'genewiki',\n",
+ " 'genomernai',\n",
+ " 'glyconnect',\n",
+ " 'glycosmos',\n",
+ " 'glygen',\n",
+ " 'go',\n",
+ " 'gpcrdb',\n",
+ " 'gramene',\n",
+ " 'guidetopharmacology',\n",
+ " 'hamap',\n",
+ " 'hgnc',\n",
+ " 'hogenom',\n",
+ " 'hpa',\n",
+ " 'huge',\n",
+ " 'ideal',\n",
+ " 'imgt_gene-db',\n",
+ " 'inparanoid',\n",
+ " 'intact',\n",
+ " 'interpro',\n",
+ " 'iptmnet',\n",
+ " 'japonicusdb',\n",
+ " 'jpost',\n",
+ " 'kegg',\n",
+ " 'legiolist',\n",
+ " 'leproma',\n",
+ " 'maizegdb',\n",
+ " 'malacards',\n",
+ " 'mane-select',\n",
+ " 'massive',\n",
+ " 'merops',\n",
+ " 'metosite',\n",
+ " 'mgi',\n",
+ " 'mim',\n",
+ " 'mint',\n",
+ " 'mobidb',\n",
+ " 'modbase',\n",
+ " 'moondb',\n",
+ " 'moonprot',\n",
+ " 'ncbifam',\n",
+ " 'niagads',\n",
+ " 'ogp',\n",
+ " 'oma',\n",
+ " 'opentargets',\n",
+ " 'orphanet',\n",
+ " 'orthodb',\n",
+ " 'pan-go',\n",
+ " 'panther',\n",
+ " 'pathwaycommons',\n",
+ " 'patric',\n",
+ " 'paxdb',\n",
+ " 'pcddb',\n",
+ " 'pdb',\n",
+ " 'pdbe-kb',\n",
+ " 'pdbj',\n",
+ " 'pdbsum',\n",
+ " 'peptideatlas',\n",
+ " 'peroxibase',\n",
+ " 'pfam',\n",
+ " 'pharos',\n",
+ " 'phi-base',\n",
+ " 'phosphositeplus',\n",
+ " 'phylomedb',\n",
+ " 'pir',\n",
+ " 'pirsf',\n",
+ " 'plantreactome',\n",
+ " 'pombase',\n",
+ " 'pride',\n",
+ " 'prints',\n",
+ " 'pro',\n",
+ " 'promex',\n",
+ " 'prosite',\n",
+ " 'proteomes',\n",
+ " 'proteomicsdb',\n",
+ " 'pseudocap',\n",
+ " 'pumba',\n",
+ " 'rcsb-pdb',\n",
+ " 'reactome',\n",
+ " 'rebase',\n",
+ " 'refseq',\n",
+ " 'reproduction-2dpage',\n",
+ " 'rgd',\n",
+ " 'rnact',\n",
+ " 'rouge',\n",
+ " 'sabio-rk',\n",
+ " 'sasbdb',\n",
+ " 'sfld',\n",
+ " 'sgd',\n",
+ " 'signalink',\n",
+ " 'signor',\n",
+ " 'smart',\n",
+ " 'smr',\n",
+ " 'strenda-db',\n",
+ " 'string',\n",
+ " 'supfam',\n",
+ " 'swiss-model-workspace',\n",
+ " 'swisslipids',\n",
+ " 'swisspalm',\n",
+ " 'tair',\n",
+ " 'tcdb',\n",
+ " 'topdownproteomics',\n",
+ " 'tuberculist',\n",
+ " 'ucsc',\n",
+ " 'unilectin',\n",
+ " 'unipathway',\n",
+ " 'veupathdb',\n",
+ " 'vgnc',\n",
+ " 'wbparasite',\n",
+ " 'wormbase',\n",
+ " 'xenbase',\n",
+ " 'ycharos',\n",
+ " 'zfin'}"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def _normalised_values(entries, key):\n",
+ "    # Collect the stripped, lower-cased value of `key` from every dict entry where it is present and non-empty.\n",
+ "    return {item[key].strip().lower() for item in entries if isinstance(item, dict) and item.get(key)}\n",
+ "\n",
+ "\n",
+ "# Full database names taken from the `name` field of the registry response.\n",
+ "uniprot_dblist_db_names = _normalised_values(registry_data[\"results\"], \"name\")\n",
+ "print(\"Official UniProt name count:\", len(uniprot_dblist_db_names))\n",
+ "\n",
+ "# Database abbreviations taken from the `abbrev` field; this set is the cell's displayed result.\n",
+ "uniprot_dblist_prefixes = _normalised_values(registry_data[\"results\"], \"abbrev\")\n",
+ "print(\"Official UniProt abbrev count:\", len(uniprot_dblist_prefixes))\n",
+ "\n",
+ "uniprot_dblist_prefixes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a45daf4c-a03c-40c9-ad5f-28e07b80f4ed",
+ "metadata": {},
+ "source": [
+ "## Load ID mapping prefixes.txt\n",
+ "\n",
+ "List of all unique prefixes that appear in the UniProt ID mapping data file: \n",
+ "[idmapping.dat.gz](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "be826bcd-15c6-4a84-8938-be0365cd155a",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "UniProt ID mapping prefixes: 103\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'allergome',\n",
+ " 'arachnoserver',\n",
+ " 'araport',\n",
+ " 'biocyc',\n",
+ " 'biogrid',\n",
+ " 'biomuta',\n",
+ " 'ccds',\n",
+ " 'cgd',\n",
+ " 'chembl',\n",
+ " 'chitars',\n",
+ " 'collectf',\n",
+ " 'complexportal',\n",
+ " 'conoserver',\n",
+ " 'cptac',\n",
+ " 'crc64',\n",
+ " 'dictybase',\n",
+ " 'dip',\n",
+ " 'disprot',\n",
+ " 'dmdm',\n",
+ " 'dnasu',\n",
+ " 'drugbank',\n",
+ " 'echobase',\n",
+ " 'eggnog',\n",
+ " 'embl',\n",
+ " 'embl-cds',\n",
+ " 'emdb',\n",
+ " 'ensembl',\n",
+ " 'ensembl_pro',\n",
+ " 'ensembl_trs',\n",
+ " 'ensemblgenome',\n",
+ " 'ensemblgenome_pro',\n",
+ " 'ensemblgenome_trs',\n",
+ " 'esther',\n",
+ " 'euhcvdb',\n",
+ " 'flybase',\n",
+ " 'gene_name',\n",
+ " 'gene_orderedlocusname',\n",
+ " 'gene_orfname',\n",
+ " 'gene_synonym',\n",
+ " 'genecards',\n",
+ " 'geneid',\n",
+ " 'genereviews',\n",
+ " 'genetree',\n",
+ " 'genewiki',\n",
+ " 'genomernai',\n",
+ " 'gi',\n",
+ " 'glyconnect',\n",
+ " 'guidetopharmacology',\n",
+ " 'hgnc',\n",
+ " 'hogenom',\n",
+ " 'ideal',\n",
+ " 'japonicusdb',\n",
+ " 'kegg',\n",
+ " 'legiolist',\n",
+ " 'leproma',\n",
+ " 'maizegdb',\n",
+ " 'merops',\n",
+ " 'mgi',\n",
+ " 'mim',\n",
+ " 'mint',\n",
+ " 'ncbi_taxid',\n",
+ " 'nextprot',\n",
+ " 'oma',\n",
+ " 'opentargets',\n",
+ " 'orphanet',\n",
+ " 'orthodb',\n",
+ " 'patric',\n",
+ " 'pdb',\n",
+ " 'peroxibase',\n",
+ " 'pharmgkb',\n",
+ " 'phi-base',\n",
+ " 'plantreactome',\n",
+ " 'pombase',\n",
+ " 'proteomicsdb',\n",
+ " 'pseudocap',\n",
+ " 'reactome',\n",
+ " 'rebase',\n",
+ " 'refseq',\n",
+ " 'refseq_nt',\n",
+ " 'rgd',\n",
+ " 'sgd',\n",
+ " 'string',\n",
+ " 'swisslipids',\n",
+ " 'tair',\n",
+ " 'tcdb',\n",
+ " 'treefam',\n",
+ " 'tuberculist',\n",
+ " 'ucsc',\n",
+ " 'uniparc',\n",
+ " 'unipathway',\n",
+ " 'uniprotkb-id',\n",
+ " 'uniref100',\n",
+ " 'uniref50',\n",
+ " 'uniref90',\n",
+ " 'veupathdb',\n",
+ " 'vgnc',\n",
+ " 'wbparasite',\n",
+ " 'wbparasite_trs_pro',\n",
+ " 'wormbase',\n",
+ " 'wormbase_pro',\n",
+ " 'wormbase_trs',\n",
+ " 'xenbase',\n",
+ " 'zfin'}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ID_MAPPING_PREFIXES = Path(\"data/prefixes.txt\")\n",
+ "# Each non-blank line is treated as one prefix. Decode explicitly as UTF-8 so the result\n",
+ "# does not depend on the platform's default locale encoding, then trim and lower-case.\n",
+ "idmapping_prefixes = {\n",
+ "    line.strip().lower()\n",
+ "    for line in ID_MAPPING_PREFIXES.read_text(encoding=\"utf-8\").splitlines()\n",
+ "    if line.strip()\n",
+ "}\n",
+ "print(\"UniProt ID mapping prefixes:\", len(idmapping_prefixes))\n",
+ "idmapping_prefixes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "ec0aedd7-2aa8-4450-a4c1-1343052772c3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "25\n",
+ "['crc64', 'embl-cds', 'ensembl_pro', 'ensembl_trs', 'ensemblgenome', 'ensemblgenome_pro', 'ensemblgenome_trs', 'gene_name', 'gene_orderedlocusname', 'gene_orfname', 'gene_synonym', 'gi', 'ncbi_taxid', 'nextprot', 'pharmgkb', 'refseq_nt', 'treefam', 'uniparc', 'uniprotkb-id', 'uniref100', 'uniref50', 'uniref90', 'wbparasite_trs_pro', 'wormbase_pro', 'wormbase_trs']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Set difference: prefixes seen in the ID mapping file that are absent from the UniProt dblist abbreviations.\n",
+ "idmapping_not_in_dblist = idmapping_prefixes.difference(uniprot_dblist_prefixes)\n",
+ "print(len(idmapping_not_in_dblist), sorted(idmapping_not_in_dblist), sep=\"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7fb42f50-557e-42dc-95d8-a30c7eb1d690",
+ "metadata": {},
+ "source": [
+ "### Classification of ID Mapping Prefixes Not Present in UniProt Official Cross-Reference Registry\n",
+ "\n",
+ "The following prefixes appear in the ID mapping-derived set but are not listed in the UniProt official cross-reference registry.\n",
+ "\n",
+ "They fall into several categories:\n",
+ "\n",
+ "1.\tInternal UniProt metadata\n",
+ "2.\tSubtype mappings \n",
+ "3.\tExternal biological databases\n",
+ "4.\tDeprecated or taxonomy identifiers\n",
+ "5.\tUniProt-derived resources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "55ebe5f3-4396-4043-9d19-b0599745eb6c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "internal_metadata (6):\n",
+ "['crc64', 'gene_name', 'gene_orderedlocusname', 'gene_orfname', 'gene_synonym', 'uniprotkb-id']\n",
+ "\n",
+ "subtype_mapping (9):\n",
+ "['embl-cds', 'ensembl_pro', 'ensembl_trs', 'ensemblgenome_pro', 'ensemblgenome_trs', 'refseq_nt', 'wbparasite_trs_pro', 'wormbase_pro', 'wormbase_trs']\n",
+ "\n",
+ "external_database_candidate (4):\n",
+ "['ensemblgenome', 'nextprot', 'pharmgkb', 'treefam']\n",
+ "\n",
+ "deprecated_identifier (1):\n",
+ "['gi']\n",
+ "\n",
+ "taxonomy_identifier (1):\n",
+ "['ncbi_taxid']\n",
+ "\n",
+ "uniprot_derived_db (4):\n",
+ "['uniparc', 'uniref100', 'uniref50', 'uniref90']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Bucket every ID-mapping-only prefix into exactly one category; the first matching rule wins.\n",
+ "classification = defaultdict(list)\n",
+ "\n",
+ "# Subtype prefixes that do not carry a `_pro` / `_trs` marker token.\n",
+ "SUBTYPE_MAPPING = {\n",
+ "    \"embl-cds\",  # EMBL CDS subtype\n",
+ "    \"refseq_nt\",  # RefSeq nucleotide subtype\n",
+ "}\n",
+ "\n",
+ "# `_`-delimited tokens that mark a subtype of a parent database.\n",
+ "SUBTYPE_MARKER_TOKENS = {\"pro\", \"trs\"}\n",
+ "\n",
+ "for p in sorted(idmapping_not_in_dblist):\n",
+ "    # internal annotation fields\n",
+ "    if p.startswith(\"gene_\") or p in {\"crc64\", \"uniprotkb-id\"}:\n",
+ "        category = \"internal_metadata\"\n",
+ "\n",
+ "    # UniProt derived databases\n",
+ "    elif p.startswith(\"uniref\") or p == \"uniparc\":\n",
+ "        category = \"uniprot_derived_db\"\n",
+ "\n",
+ "    # subtype-specific identifiers: compare whole `_`-separated tokens after the first one,\n",
+ "    # so a name that merely contains \"_pro\" / \"_trs\" (e.g. \"foo_protein\") is not misclassified\n",
+ "    elif p in SUBTYPE_MAPPING or not SUBTYPE_MARKER_TOKENS.isdisjoint(p.split(\"_\")[1:]):\n",
+ "        category = \"subtype_mapping\"\n",
+ "\n",
+ "    # deprecated identifiers\n",
+ "    elif p == \"gi\":\n",
+ "        category = \"deprecated_identifier\"\n",
+ "\n",
+ "    # taxonomy identifiers\n",
+ "    elif p == \"ncbi_taxid\":\n",
+ "        category = \"taxonomy_identifier\"\n",
+ "\n",
+ "    # external database candidate (anything no rule above claimed)\n",
+ "    else:\n",
+ "        category = \"external_database_candidate\"\n",
+ "\n",
+ "    classification[category].append(p)\n",
+ "\n",
+ "for k, v in classification.items():\n",
+ "    print(f\"\\n{k} ({len(v)}):\")\n",
+ "    print(sorted(v))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d23a14b8-8423-4ad4-9b2e-48091284899e",
+ "metadata": {},
+ "source": [
+ "## Load parquet prefixes\n",
+ "\n",
+ "The following code must be run on BERDL."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "95517508-c76d-4929-8597-d7ec565ae43c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from berdl_notebook_utils.setup_spark_session import get_spark_session\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import col, lower"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "5acde194-8e7d-4d7c-8048-c5e37bd3ac51",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = get_spark_session(\"PrefixExploration\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "27761389-8c58-4f8b-a44a-0a520a23d787",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- entity_id: string (nullable = true)\n",
+ " |-- db: string (nullable = true)\n",
+ " |-- xref: string (nullable = true)\n",
+ " |-- description: string (nullable = true)\n",
+ " |-- _dlt_load_id: string (nullable = true)\n",
+ " |-- _dlt_id: string (nullable = true)\n",
+ " |-- relationship: string (nullable = true)\n",
+ "\n",
+ "Databases in the `identifiers` table:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'ABCD',\n",
+ " 'AGR',\n",
+ " 'Agora',\n",
+ " 'Allergome',\n",
+ " 'AlphaFoldDB',\n",
+ " 'AntiFam',\n",
+ " 'Antibodypedia',\n",
+ " 'ArachnoServer',\n",
+ " 'Araport',\n",
+ " 'BMRB',\n",
+ " 'BRENDA',\n",
+ " 'Bgee',\n",
+ " 'BindingDB',\n",
+ " 'BioCyc',\n",
+ " 'BioGRID',\n",
+ " 'BioGRID-ORCS',\n",
+ " 'BioMuta',\n",
+ " 'CARD',\n",
+ " 'CAZy',\n",
+ " 'CCDS',\n",
+ " 'CD-CODE',\n",
+ " 'CDD',\n",
+ " 'CGD',\n",
+ " 'CIViC',\n",
+ " 'CORUM',\n",
+ " 'CPTAC',\n",
+ " 'CPTC',\n",
+ " 'CTD',\n",
+ " 'CarbonylDB',\n",
+ " 'ChEMBL',\n",
+ " 'ChiTaRS',\n",
+ " 'ClinPGx',\n",
+ " 'CollecTF',\n",
+ " 'ComplexPortal',\n",
+ " 'ConoServer',\n",
+ " 'DEPOD',\n",
+ " 'DIP',\n",
+ " 'DMDM',\n",
+ " 'DNASU',\n",
+ " 'DisGeNET',\n",
+ " 'DisProt',\n",
+ " 'DrugBank',\n",
+ " 'DrugCentral',\n",
+ " 'EC',\n",
+ " 'ELM',\n",
+ " 'EMDB',\n",
+ " 'ESTHER',\n",
+ " 'EchoBASE',\n",
+ " 'EnsemblBacteria',\n",
+ " 'EnsemblFungi',\n",
+ " 'EnsemblMetazoa',\n",
+ " 'EnsemblPlants',\n",
+ " 'EnsemblProtists',\n",
+ " 'EvolutionaryTrace',\n",
+ " 'ExpressionAtlas',\n",
+ " 'FlyBase',\n",
+ " 'FunCoup',\n",
+ " 'FunFam',\n",
+ " 'GO',\n",
+ " 'Gene3D',\n",
+ " 'GeneCards',\n",
+ " 'GeneID',\n",
+ " 'GeneReviews',\n",
+ " 'GeneTree',\n",
+ " 'GeneWiki',\n",
+ " 'GenomeRNAi',\n",
+ " 'GlyConnect',\n",
+ " 'GlyCosmos',\n",
+ " 'GlyGen',\n",
+ " 'Gramene',\n",
+ " 'GuidetoPHARMACOLOGY',\n",
+ " 'HAMAP',\n",
+ " 'HGNC',\n",
+ " 'HOGENOM',\n",
+ " 'HPA',\n",
+ " 'IDEAL',\n",
+ " 'IMGT_GENE-DB',\n",
+ " 'InParanoid',\n",
+ " 'IntAct',\n",
+ " 'InterPro',\n",
+ " 'JaponicusDB',\n",
+ " 'KEGG',\n",
+ " 'LegioList',\n",
+ " 'Leproma',\n",
+ " 'MEROPS',\n",
+ " 'MGI',\n",
+ " 'MIM',\n",
+ " 'MINT',\n",
+ " 'MaizeGDB',\n",
+ " 'MalaCards',\n",
+ " 'MassIVE',\n",
+ " 'MetOSite',\n",
+ " 'MoonDB',\n",
+ " 'MoonProt',\n",
+ " 'NCBITaxon',\n",
+ " 'NCBIfam',\n",
+ " 'NIAGADS',\n",
+ " 'OGP',\n",
+ " 'OMA',\n",
+ " 'OpenTargets',\n",
+ " 'Orphanet',\n",
+ " 'OrthoDB',\n",
+ " 'PAN-GO',\n",
+ " 'PANTHER',\n",
+ " 'PATRIC',\n",
+ " 'PCDDB',\n",
+ " 'PDB',\n",
+ " 'PDBsum',\n",
+ " 'PHI-base',\n",
+ " 'PIR',\n",
+ " 'PIRSF',\n",
+ " 'PRIDE',\n",
+ " 'PRINTS',\n",
+ " 'PRO',\n",
+ " 'PROSITE',\n",
+ " 'PathwayCommons',\n",
+ " 'PaxDb',\n",
+ " 'PeptideAtlas',\n",
+ " 'PeroxiBase',\n",
+ " 'Pfam',\n",
+ " 'Pharos',\n",
+ " 'PhosphoSitePlus',\n",
+ " 'PhylomeDB',\n",
+ " 'PlantReactome',\n",
+ " 'PomBase',\n",
+ " 'ProMEX',\n",
+ " 'Proteomes',\n",
+ " 'ProteomicsDB',\n",
+ " 'PseudoCAP',\n",
+ " 'Pumba',\n",
+ " 'REBASE',\n",
+ " 'REPRODUCTION-2DPAGE',\n",
+ " 'RGD',\n",
+ " 'RNAct',\n",
+ " 'Reactome',\n",
+ " 'SABIO-RK',\n",
+ " 'SASBDB',\n",
+ " 'SFLD',\n",
+ " 'SGD',\n",
+ " 'SIGNOR',\n",
+ " 'SMART',\n",
+ " 'SMR',\n",
+ " 'STRENDA-DB',\n",
+ " 'STRING',\n",
+ " 'SUPFAM',\n",
+ " 'SignaLink',\n",
+ " 'SwissLipids',\n",
+ " 'SwissPalm',\n",
+ " 'TAIR',\n",
+ " 'TCDB',\n",
+ " 'TopDownProteomics',\n",
+ " 'TubercuList',\n",
+ " 'UCSC',\n",
+ " 'UniLectin',\n",
+ " 'UniPathway',\n",
+ " 'UniProt',\n",
+ " 'VEuPathDB',\n",
+ " 'VGNC',\n",
+ " 'WBParaSite',\n",
+ " 'WormBase',\n",
+ " 'Xenbase',\n",
+ " 'YCharOS',\n",
+ " 'ZFIN',\n",
+ " 'dictyBase',\n",
+ " 'eggNOG',\n",
+ " 'ensembl',\n",
+ " 'euHCVdb',\n",
+ " 'genbank',\n",
+ " 'iPTMnet',\n",
+ " 'jPOST',\n",
+ " 'refseq'}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the identifier table of the raw UniProt dataset on BERDL.\n",
+ "# The dataset is already partitioned by the `db` column, which makes it quicker to access.\n",
+ "df = spark.read.parquet(\n",
+ "    \"s3a://cdm-lake/tenant-general-warehouse/kbase/datasets/uniprot/uniprot_kb/identifier_partitioned\"\n",
+ ")\n",
+ "df.printSchema()\n",
+ "# Collect the distinct `db` values (the prefixes) to the driver as a Python set.\n",
+ "distinct_db_rows = df.select(\"db\").distinct().collect()\n",
+ "identifier_table_dbs = {row[\"db\"] for row in distinct_db_rows}\n",
+ "print(\"Databases in the `identifiers` table:\")\n",
+ "identifier_table_dbs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "3300715d-1098-4dcf-9ae1-58abcd3d44e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1421"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Persist the identifier-table db prefixes (one per line, sorted) so they can be\n",
+ "# queried later without BERDL access.\n",
+ "outfile = Path(\"data\", \"identifier_table_prefixes.txt\")\n",
+ "sorted_db_names = sorted(identifier_table_dbs)\n",
+ "outfile.write_text(\"\\n\".join(sorted_db_names))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "b583ed7d-629e-4258-8211-39c39bce66dc",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parquet prefixes: 171\n",
+ "{'ensemblprotists', 'peroxibase', 'strenda-db', 'civic', 'swisslipids', 'funfam', 'mgi', 'pro', 'corum', 'sfld', 'hogenom', 'biocyc', 'peptideatlas', 'echobase', 'drugbank', 'tuberculist', 'allergome', 'carbonyldb', 'prints', 'antibodypedia', 'smr', 'supfam', 'phosphositeplus', 'jpost', 'chitars', 'reactome', 'tair', 'expressionatlas', 'ncbitaxon', 'genewiki', 'esther', 'genbank', 'pumba', 'genereviews', 'panther', 'rgd', 'proteomes', 'malacards', 'moondb', 'flybase', 'araport', 'hpa', 'agora', 'ensemblbacteria', 'reproduction-2dpage', 'cgd', 'funcoup', 'disgenet', 'sgd', 'inparanoid', 'ncbifam', 'glyconnect', 'ycharos', 'moonprot', 'phylomedb', 'pombase', 'dmdm', 'prosite', 'hamap', 'pir', 'orphanet', 'swisspalm', 'dictybase', 'ensemblfungi', 'glycosmos', 'japonicusdb', 'clinpgx', 'bindingdb', 'glygen', 'interpro', 'vgnc', 'arachnoserver', 'ensemblmetazoa', 'sabio-rk', 'string', 'gene3d', 'sasbdb', 'ccds', 'drugcentral', 'abcd', 'legiolist', 'opentargets', 'metosite', 'cd-code', 'imgt_gene-db', 'pirsf', 'go', 'niagads', 'emdb', 'gramene', 'pathwaycommons', 'veupathdb', 'ensemblplants', 'complexportal', 'ec', 'oma', 'topdownproteomics', 'patric', 'plantreactome', 'genomernai', 'pharos', 'biogrid-orcs', 'unilectin', 'rebase', 'cptc', 'maizegdb', 'eggnog', 'dnasu', 'evolutionarytrace', 'unipathway', 'cazy', 'pdb', 'euhcvdb', 'geneid', 'mint', 'ogp', 'depod', 'pdbsum', 'wormbase', 'signor', 'ctd', 'ucsc', 'elm', 'intact', 'pan-go', 'orthodb', 'uniprot', 'kegg', 'bmrb', 'bgee', 'disprot', 'iptmnet', 'smart', 'agr', 'pride', 'antifam', 'zfin', 'genetree', 'signalink', 'brenda', 'leproma', 'proteomicsdb', 'wbparasite', 'genecards', 'hgnc', 'rnact', 'cptac', 'refseq', 'pseudocap', 'paxdb', 'tcdb', 'phi-base', 'merops', 'collectf', 'pcddb', 'mim', 'ensembl', 'promex', 'chembl', 'pfam', 'cdd', 'biomuta', 'ideal', 'alphafolddb', 'conoserver', 'guidetopharmacology', 'dip', 'biogrid', 'card', 'xenbase', 'massive'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Lower-case the identifier-table db names so they compare with the other (lower-cased) prefix sets.\n",
+ "parquet_set = {db.lower() for db in identifier_table_dbs}\n",
+ "print(\"Parquet prefixes:\", len(parquet_set))\n",
+ "# Print in sorted order: raw set order depends on string hashing and changes between kernel runs.\n",
+ "print(sorted(parquet_set))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "468a0752-b7b8-4403-91dd-d531d14b26d9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Prefixes in the Identifier table that don't appear in the UniProt dblist prefixes (3):\n",
+ "['ec', 'ncbitaxon', 'uniprot']\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Prefixes in parquet but not in UniProt official list\n",
+ "parquet_not_in_uniprot = parquet_set.difference(uniprot_dblist_prefixes)\n",
+ "missing_count = len(parquet_not_in_uniprot)\n",
+ "print(f\"Prefixes in the Identifier table that don't appear in the UniProt dblist prefixes ({missing_count}):\")\n",
+ "print(sorted(parquet_not_in_uniprot))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7a90b18-9c62-421b-92ce-9964e28338ed",
+ "metadata": {
+ "jupyter": {
+ "source_hidden": true
+ }
+ },
+ "source": [
+ "### Interpretation\n",
+ "\n",
+ "These are not true registry gaps:\n",
+ "\n",
+ "- **ec** – Represents EC numbers. \n",
+ "- **ncbitaxon** – A naming variation of NCBI Taxonomy.\n",
+ "- **uniprot** – UniProt itself is not listed as an external cross-reference database.\n",
+ "\n",
+ "Conclusion: none of these three prefixes is an unregistered external database.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "0ed15078-f074-4715-90ad-338f48ab329a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "external_candidates = classification[\"external_database_candidate\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "d89a5eef-7707-42d2-86e6-fa6cdafa6538",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ensemblgenome | name=False | abbrev=False| parquet=False\n",
+ "nextprot | name=False | abbrev=False| parquet=False\n",
+ "pharmgkb | name=False | abbrev=False| parquet=False\n",
+ "treefam | name=False | abbrev=False| parquet=False\n"
+ ]
+ }
+ ],
+ "source": [
+ "# For each external-database candidate, report whether it is known to the UniProt dblist\n",
+ "# (by name or by abbreviation) and whether it occurs in the identifier table.\n",
+ "for candidate in external_candidates:\n",
+ "    in_name, in_abbrev, in_parquet = (\n",
+ "        candidate in known for known in (uniprot_dblist_db_names, uniprot_dblist_prefixes, parquet_set)\n",
+ "    )\n",
+ "    print(f\"{candidate:20} | name={in_name} | abbrev={in_abbrev}| parquet={in_parquet}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfd2a252-92df-4cb6-8804-cc78087599b5",
+ "metadata": {},
+ "source": [
+ "None of the prefixes classified as external database candidates (ensemblgenome, nextprot, pharmgkb, treefam) appears in the UniProt dblist names or abbreviations, or in the BERDL parquet dataset.\n",
+ "\n",
+ "This indicates that while these namespaces correspond to real biological databases, they are not used in the current dataset snapshot. They remain classified as external databases based on their semantic meaning rather than dataset usage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f76fe224-05e7-46a8-bd73-556e42dd55d5",
+ "metadata": {},
+ "source": [
+ "# Classification of Differences"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d1b9add8-cd8c-4a6a-8a6b-7f1aba920afe",
+ "metadata": {},
+ "source": [
+ "### A. UniProt annotation metadata \n",
+ "- crc64\n",
+ "- gene_name\n",
+ "- gene_orderedlocusname\n",
+ "- gene_orfname\n",
+ "- gene_synonym\n",
+ "- uniprotkb-id\n",
+ "\n",
+ "These fields represent internal UniProt annotations rather than cross-references to external databases. Examples include gene name annotations and sequence checksums maintained directly within UniProt records.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c32f4bd-73d3-42cc-95d8-029b6df33340",
+ "metadata": {},
+ "source": [
+ "### B. UniProt derived databases \n",
+ "- uniparc \n",
+ "- uniref100\n",
+ "- uniref50\n",
+ "- uniref90\n",
+ "\n",
+ "These are UniProt-internal resources; no remapping is needed.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bae25fa-5e3d-4ea4-a7e6-0b1d7c73d07e",
+ "metadata": {},
+ "source": [
+ "### C. Internal NCBI identifiers\n",
+ "- gi\n",
+ "- ncbi_taxid\n",
+ " \n",
+ "`ncbi_taxid` is the NCBI taxonomy identifier;\n",
+ "`gi` is an identifier used by NCBI that has been officially deprecated.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d910bdc-b542-4eb2-b9cf-bf933ccb1cc3",
+ "metadata": {},
+ "source": [
+ "### D. Database subtype mappings \n",
+ "- embl-cds\n",
+ "- refseq_nt\n",
+ "- ensembl_pro\n",
+ "- ensembl_trs\n",
+ "- ensemblgenome_pro\n",
+ "- ensemblgenome_trs\n",
+ "- wormbase_pro\n",
+ "- wormbase_trs\n",
+ "- wbparasite_trs_pro\n",
+ "\n",
+ "#### patterns:\n",
+ "- `_pro` → protein identifiers\n",
+ "- `_trs` → transcript identifiers\n",
+ "- `-cds` → coding sequence identifiers (as in `embl-cds`)\n",
+ "- `_nt` → nucleotide accessions\n",
+ "\n",
+ "These indicate the identifier type within a parent database. They need to be normalized to the parent database prefix.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ff3e650-ec14-426e-a97e-74db77a62105",
+ "metadata": {},
+ "source": [
+ "### E. External database \n",
+ "\n",
+ "- ensemblgenome\n",
+ "- nextprot\n",
+ "- pharmgkb\n",
+ "- treefam\n",
+ "\n",
+ "#### examples\n",
+ "\n",
+ "- `EnsemblGenome` – genome annotation database\n",
+ "- `NextProt` – human protein knowledgebase\n",
+ "- `PharmGKB` – pharmacogenomics database\n",
+ "- `TreeFam` – phylogenetic gene family database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "669bf324-fc95-4727-a02d-3dd6c2b64d57",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'prefix': 'crc64', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'embl-cds', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ensembl_pro', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ensembl_trs', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ensemblgenome', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ensemblgenome_pro', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ensemblgenome_trs', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'gene_name', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'gene_orderedlocusname', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'gene_orfname', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'gene_synonym', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'gi', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'ncbi_taxid', 'bioregistry_found': True, 'normalized': 'ncbitaxon'}\n",
+ "{'prefix': 'nextprot', 'bioregistry_found': True, 'normalized': 'nextprot'}\n",
+ "{'prefix': 'pharmgkb', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'refseq_nt', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'treefam', 'bioregistry_found': True, 'normalized': 'treefam'}\n",
+ "{'prefix': 'uniparc', 'bioregistry_found': True, 'normalized': 'uniparc'}\n",
+ "{'prefix': 'uniprotkb-id', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'uniref100', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'uniref50', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'uniref90', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'wbparasite_trs_pro', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'wormbase_pro', 'bioregistry_found': False, 'normalized': None}\n",
+ "{'prefix': 'wormbase_trs', 'bioregistry_found': False, 'normalized': None}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Ask Bioregistry about every ID-mapping-only prefix: is there a resource for it,\n",
+ "# and what does the prefix normalize to?\n",
+ "prefixes = sorted(idmapping_not_in_dblist)\n",
+ "results = [\n",
+ "    {\n",
+ "        \"prefix\": p,\n",
+ "        \"bioregistry_found\": br.get_resource(p) is not None,\n",
+ "        \"normalized\": br.normalize_prefix(p),\n",
+ "    }\n",
+ "    for p in prefixes\n",
+ "]\n",
+ "\n",
+ "for row in results:\n",
+ "    print(row)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26e2628e-97a3-44dc-a235-19bf489c5b62",
+ "metadata": {},
+ "source": [
+ "## Conclusion\n",
+ "\n",
+ "The Bioregistry package partially supports prefix remapping, but it is not sufficient on its own as a solution for the UniProt / BERDL prefix governance workflow."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a81adc6f-796d-4593-82ff-7f78f9809f25",
+ "metadata": {},
+ "source": [
+ "### Bioregistry is effective for:\n",
+ "\n",
+ "- Canonical prefix normalization\n",
+ "- Synonym resolution (e.g., ncbi_taxid → ncbitaxon)\n",
+ "- Validation of recognized external biological databases\n",
+ "\n",
+ "### Bioregistry does not cover: \n",
+ "\n",
+ "- Subtype-specific identifiers (e.g., ensembl_pro, refseq_nt)\n",
+ "- UniProt internal metadata fields (e.g., gene_name, crc64)\n",
+ "- UniProt-derived internal resources (e.g., uniref100)\n",
+ "- Deprecated identifiers, which require manual handling or exclusion (e.g., gi)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "9246627a-113b-4ffd-a471-21b03d22936f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'embl-cds': 'embl',\n",
+ " 'ensembl_pro': 'ensembl',\n",
+ " 'ensembl_trs': 'ensembl',\n",
+ " 'ensemblgenome_pro': 'ensemblgenome',\n",
+ " 'ensemblgenome_trs': 'ensemblgenome',\n",
+ " 'refseq_nt': 'refseq',\n",
+ " 'wbparasite_trs_pro': 'wbparasite',\n",
+ " 'wormbase_pro': 'wormbase',\n",
+ " 'wormbase_trs': 'wormbase'}"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "INTERNAL_PREFIXES = set(classification.get(\"internal_metadata\", []))\n",
+ "\n",
+ "## subtype has feature, xxx_pro/xxx_trs/xxx_nt/xxx_cds\n",
+ "SUBTYPE_TOKENS = {\"pro\", \"trs\", \"nt\", \"cds\"}\n",
+ "\n",
+ "\n",
+ "def infer_parent_prefix(prefix: str) -> str:\n",
+ " \"\"\"Deduce the parent prefix from the suffix.\n",
+ "\n",
+ " :param prefix: prefix string\n",
+ " :type prefix: str\n",
+ " :return: parent prefix\n",
+ " :rtype: str\n",
+ " \"\"\"\n",
+ " tokens = prefix.replace(\"-\", \"_\").split(\"_\")\n",
+ " tokens = [t for t in tokens if t not in SUBTYPE_TOKENS]\n",
+ " return \"_\".join(tokens)\n",
+ "\n",
+ "\n",
+ "SUBTYPE_RULES = {p: infer_parent_prefix(p) for p in classification.get(\"subtype_mapping\", [])}\n",
+ "\n",
+ "SUBTYPE_RULES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "db2d89f7-2e4f-42f7-bcfb-cf8a999728da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## INTERNAL_PREFIXES:\n",
+ "## if prefix.startswith(\"gene_\")\n",
+ "## if prefix in {\"crc64\"}\n",
+ "## if prefix.endswith(\"-id\")\n",
+ "\n",
+ "INTERNAL_KEYWORDS = {\n",
+ " \"crc64\", ## only need UniProt checksum, not namespace\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def is_internal_prefix(prefix: str) -> bool:\n",
+ " prefix = prefix.lower()\n",
+ "\n",
+ " if prefix.startswith(\"gene_\"):\n",
+ " return True\n",
+ "\n",
+ " if prefix in INTERNAL_KEYWORDS:\n",
+ " return True\n",
+ "\n",
+ " return bool(prefix.endswith(\"-id\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "e5a85c1b-ef56-44cf-b833-f1f4d1b5a91f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'crc64',\n",
+ " 'gene_name',\n",
+ " 'gene_orderedlocusname',\n",
+ " 'gene_orfname',\n",
+ " 'gene_synonym',\n",
+ " 'uniprotkb-id'}"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "INTERNAL_PREFIXES = {p for p in idmapping_prefixes if is_internal_prefix(p)}\n",
+ "INTERNAL_PREFIXES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "0fe486cc-f77b-4f52-8e17-221940439926",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEPRECATED_PREFIXES = set(classification.get(\"deprecated_identifier\", []))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "d9920dd9-4f4a-4cbf-a8f7-7f56498232af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remap_prefix(prefix: str) -> dict:\n",
+ "    \"\"\"Resolve a prefix to a canonical prefix and record which rule produced the answer.\n",
+ "\n",
+ "    Rules are tried in this order: internal prefix, deprecated prefix, subtype rule,\n",
+ "    Bioregistry normalization. The input is lower-cased before any rule is applied.\n",
+ "\n",
+ "    :param prefix: prefix string\n",
+ "    :type prefix: str\n",
+ "    :return: dict with keys ``original`` (lower-cased input), ``canonical`` (resolved prefix\n",
+ "        or None) and ``source`` (``internal``, ``deprecated``, ``subtype``, ``bioregistry``\n",
+ "        or ``unresolved``)\n",
+ "    :rtype: dict\n",
+ "    \"\"\"\n",
+ "    key = prefix.lower()\n",
+ "\n",
+ "    def outcome(canonical, source):\n",
+ "        # every branch returns the same three-key record\n",
+ "        return {\"original\": key, \"canonical\": canonical, \"source\": source}\n",
+ "\n",
+ "    if is_internal_prefix(key):\n",
+ "        return outcome(None, \"internal\")\n",
+ "    if key in DEPRECATED_PREFIXES:\n",
+ "        return outcome(None, \"deprecated\")\n",
+ "    if key in SUBTYPE_RULES:\n",
+ "        return outcome(SUBTYPE_RULES[key], \"subtype\")\n",
+ "\n",
+ "    bioregistry_prefix = br.normalize_prefix(key)\n",
+ "    if bioregistry_prefix:\n",
+ "        return outcome(bioregistry_prefix, \"bioregistry\")\n",
+ "    return outcome(None, \"unresolved\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "70bba880-a266-4e37-a4ab-bf55db1fafd3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original | \n",
+ " canonical | \n",
+ " source | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " allergome | \n",
+ " allergome | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " arachnoserver | \n",
+ " arachnoserver | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " araport | \n",
+ " araport | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " biocyc | \n",
+ " biocyc | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " biogrid | \n",
+ " biogrid | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " biomuta | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " ccds | \n",
+ " ccds | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " cgd | \n",
+ " cgd | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " chembl | \n",
+ " chembl | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " chitars | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " collectf | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " complexportal | \n",
+ " complexportal | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " conoserver | \n",
+ " conoserver | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " cptac | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " crc64 | \n",
+ " NaN | \n",
+ " internal | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " dictybase | \n",
+ " dictybase | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " dip | \n",
+ " dip | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " disprot | \n",
+ " disprot | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " dmdm | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " dnasu | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " drugbank | \n",
+ " drugbank | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " echobase | \n",
+ " echobase | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " eggnog | \n",
+ " eggnog | \n",
+ " bioregistry | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " embl | \n",
+ " NaN | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " embl-cds | \n",
+ " embl | \n",
+ " subtype | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original canonical source\n",
+ "0 allergome allergome bioregistry\n",
+ "1 arachnoserver arachnoserver bioregistry\n",
+ "2 araport araport bioregistry\n",
+ "3 biocyc biocyc bioregistry\n",
+ "4 biogrid biogrid bioregistry\n",
+ "5 biomuta NaN unresolved\n",
+ "6 ccds ccds bioregistry\n",
+ "7 cgd cgd bioregistry\n",
+ "8 chembl chembl bioregistry\n",
+ "9 chitars NaN unresolved\n",
+ "10 collectf NaN unresolved\n",
+ "11 complexportal complexportal bioregistry\n",
+ "12 conoserver conoserver bioregistry\n",
+ "13 cptac NaN unresolved\n",
+ "14 crc64 NaN internal\n",
+ "15 dictybase dictybase bioregistry\n",
+ "16 dip dip bioregistry\n",
+ "17 disprot disprot bioregistry\n",
+ "18 dmdm NaN unresolved\n",
+ "19 dnasu NaN unresolved\n",
+ "20 drugbank drugbank bioregistry\n",
+ "21 echobase echobase bioregistry\n",
+ "22 eggnog eggnog bioregistry\n",
+ "23 embl NaN unresolved\n",
+ "24 embl-cds embl subtype"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Remap every ID mapping prefix (in sorted order) and tabulate one row per prefix.\n",
+ "results = list(map(remap_prefix, sorted(idmapping_prefixes)))\n",
+ "df = pd.DataFrame(results)\n",
+ "df.head(25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f4eb260-c4e4-4d77-ae37-7028b3cb8760",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21b03428-4fb3-410a-a956-0a256848b9f2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "dc6055ff-50e6-4fa1-bc63-2e0effcc8c64",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "source\n",
+ "bioregistry 56\n",
+ "unresolved 31\n",
+ "subtype 9\n",
+ "internal 6\n",
+ "deprecated 1\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df[\"source\"].value_counts())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "92cdd16e-e812-4664-ab72-0f49488d33c3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original | \n",
+ " source | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " biomuta | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " chitars | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " collectf | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " cptac | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " dmdm | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " dnasu | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " embl | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " ensemblgenome | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " esther | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " euhcvdb | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " genereviews | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 44 | \n",
+ " genomernai | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 46 | \n",
+ " glyconnect | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " guidetopharmacology | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " japonicusdb | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " legiolist | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " leproma | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 56 | \n",
+ " merops | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 62 | \n",
+ " oma | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 63 | \n",
+ " opentargets | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 66 | \n",
+ " patric | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 69 | \n",
+ " pharmgkb | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 70 | \n",
+ " phi-base | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 73 | \n",
+ " proteomicsdb | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 74 | \n",
+ " pseudocap | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 83 | \n",
+ " tair | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 91 | \n",
+ " uniref100 | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 92 | \n",
+ " uniref50 | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 93 | \n",
+ " uniref90 | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 94 | \n",
+ " veupathdb | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " wbparasite | \n",
+ " unresolved | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original source\n",
+ "5 biomuta unresolved\n",
+ "9 chitars unresolved\n",
+ "10 collectf unresolved\n",
+ "13 cptac unresolved\n",
+ "18 dmdm unresolved\n",
+ "19 dnasu unresolved\n",
+ "23 embl unresolved\n",
+ "29 ensemblgenome unresolved\n",
+ "32 esther unresolved\n",
+ "33 euhcvdb unresolved\n",
+ "41 genereviews unresolved\n",
+ "44 genomernai unresolved\n",
+ "46 glyconnect unresolved\n",
+ "47 guidetopharmacology unresolved\n",
+ "51 japonicusdb unresolved\n",
+ "53 legiolist unresolved\n",
+ "54 leproma unresolved\n",
+ "56 merops unresolved\n",
+ "62 oma unresolved\n",
+ "63 opentargets unresolved\n",
+ "66 patric unresolved\n",
+ "69 pharmgkb unresolved\n",
+ "70 phi-base unresolved\n",
+ "73 proteomicsdb unresolved\n",
+ "74 pseudocap unresolved\n",
+ "83 tair unresolved\n",
+ "91 uniref100 unresolved\n",
+ "92 uniref50 unresolved\n",
+ "93 uniref90 unresolved\n",
+ "94 veupathdb unresolved\n",
+ "96 wbparasite unresolved"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unresolved_df = df[df[\"source\"] == \"unresolved\"].sort_values(\"original\").drop(columns=[\"canonical\"])\n",
+ "unresolved_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e7a4b10-6f74-4483-906b-cfb7de1e18e6",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c272872-8c55-4be8-a3f9-e49954cdae23",
+ "metadata": {},
+ "source": [
+ "#### These are UniProt cross-reference databases. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94ae4840-ac10-42cf-aac3-cd35a20a4104",
+ "metadata": {},
+ "source": [
+ "The UniProt cluster resources (UniRef50, UniRef90, UniRef100) are not in Bioregistry."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9dd3917-8ae6-44dc-b8ba-2e071b2eeff5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "None\n",
+ "None\n",
+ "None\n",
+ "None\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Check whether Bioregistry already has a mapping for these prefixes (None means the prefix is not recognized)\n",
+ "\n",
+ "print(br.normalize_prefix(\"tair\"))\n",
+ "print(br.normalize_prefix(\"patric\"))\n",
+ "print(br.normalize_prefix(\"oma\"))\n",
+ "print(br.normalize_prefix(\"merops\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fbf6609b-18ae-4bfd-938f-f06cf2b6a5be",
+ "metadata": {},
+ "source": [
+ "These prefixes are for external biological databases not covered by the Bioregistry."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d2f9ad1-a1cd-4e0c-8909-a8ae8d42980b",
+ "metadata": {},
+ "source": [
+ "### Final Evaluation\n",
+ "\n",
+ "Bioregistry was evaluated for prefix remapping.\n",
+ "- It directly recognizes 56 out of 103 observed prefixes;\n",
+ "- It correctly normalizes prefix variants; \n",
+ "- 31 prefixes are not covered by Bioregistry; these correspond mainly to UniProt-specific resources.\n",
+ "\n",
+ "After incorporating subtype rules, internal metadata handling and deprecated identifiers, 72 of 103 prefixes (~70%) can be governed.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}