OpenBioML · kjappelbaum · Nov 18, 2023 · Oct 30, 2023 · Nov 1, 2023 · Nov 1, 2023
diff --git a/data/tabular/uniprot/meta.yaml b/data/tabular/uniprot/meta.yaml
@@ -0,0 +1,44 @@
+---
+name: uniprot
+description: |-
+    Protein sequences, the reactions they can catalyze, and descriptions of those chemical reactions.
+targets:
+    - id: sentences
+      description: sentences describing the catalytic activity of a protein
+      names:
+          - noun: catalytic activity
+    - id: reactions
+      description: biochemical reactions catalyzed by a protein
+      names:
+          - noun: chemical reactions
+          - noun: biochemical reactions
+identifiers:
+    - id: sequence
+      type: sequence
+      description: sequence
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 542630
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    # Every prompt embeds {sequence#}: it is the only input the question refers to.
+    - |-
+      User: Describe the {sentences__names__noun} of the {#protein|amino-acid sequence|AA sequence|polypeptide!} {sequence#}?
+      Assistant: {sentences#}.
+    - |-
+      User: What biochemical reactions can be catalyzed by the following {#protein|amino-acid sequence|AA sequence|polypeptide!}: {sequence#}?
+      Assistant: The biochemical reactions that can be catalyzed by the given sequence are: {reactions#}
diff --git a/data/tabular/uniprot/process_dataset.ipynb b/data/tabular/uniprot/process_dataset.ipynb
@@ -0,0 +1,296 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "702d78bd-ba4b-4b47-8644-c62df4b21c0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from typing import Dict\n",
+    "from urllib.request import urlopen \n",
+    "import regex as re\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9a2657ec-b30c-4a2d-ba64-3a326c5447cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read json from url example\n",
+    "\n",
+    "def read_json(url: str):\n",
+    "    \"\"\"Download `url` and return its body parsed as JSON.\n",
+    "\n",
+    "    The response is closed as soon as the payload has been read.\n",
+    "    \"\"\"\n",
+    "    with urlopen(url) as response:\n",
+    "        return json.loads(response.read())\n",
+    "\n",
+    "\n",
+    "def read_json_to_list_of_proteins(path: str):\n",
+    "    \"\"\"Read a UniProt JSON export and return its list of protein entries.\n",
+    "\n",
+    "    The file must hold a JSON object whose \"results\" key contains the entries.\n",
+    "\n",
+    "    Example usage:\n",
+    "        proteins = read_json_to_list_of_proteins(\"uniprotkb_AND_reviewed_true_2023_10_30.json\")\n",
+    "    \"\"\"\n",
+    "    # A context manager closes the file handle once parsing is done.\n",
+    "    with open(path, encoding=\"utf-8\") as handle:\n",
+    "        uniprot_json = json.load(handle)\n",
+    "    return uniprot_json[\"results\"]\n",
+    "\n",
+    "\n",
+    "def remove_pubchem_med_from_text(string: str):\n",
+    "    \"\"\"Strip inline PubMed citations such as \"(PubMed:123, PubMed:456)\" from `string`.\n",
+    "\n",
+    "    Whitespace directly before a citation is removed with it, so that\n",
+    "    \"binds DNA (PubMed:1). Next\" becomes \"binds DNA. Next\" and not \"binds DNA . Next\".\n",
+    "    \"\"\"\n",
+    "    pattern = r'\\s*\\(PubMed:\\d+(?:,\\s*PubMed:\\d+)*\\)'\n",
+    "    return re.sub(pattern, '', string)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "07e8b508-26af-4dff-b026-3c520bb23d8c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "example = read_json(\"https://rest.uniprot.org/uniprotkb/A0A009IHW8.json\")\n",
+    "example[\"sequence\"][\"value\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca8435e6-38e1-48ec-96ce-900e4d7c02eb",
+   "metadata": {},
+   "source": [
+    "### Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "f859045f-031a-4fb3-8fcc-9b3b2340c729",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# commentType values of interest. NOTE(review): not referenced by the functions in this cell.\n",
+    "CONSIDERED_COMMENTS = [\n",
+    "    \"CATALYTIC ACTIVITY\",\n",
+    "    \"FUNCTION\",\n",
+    "    \"DOMAIN\",\n",
+    "]\n",
+    "\n",
+    "\n",
+    "def extract_catalyzed_reactions(json_dict: Dict):\n",
+    "    \"\"\"Collect function descriptions and catalyzed reactions of one UniProt entry.\n",
+    "\n",
+    "    Args:\n",
+    "        json_dict: entry dictionary; its \"comments\" list is scanned.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple `(sentences, reactions)` of de-duplicated lists in order of first\n",
+    "        appearance. Sentences are the texts of \"FUNCTION\" comments with PubMed\n",
+    "        citations removed; reactions are the reaction names of\n",
+    "        \"CATALYTIC ACTIVITY\" comments with \"=\" replaced by \"->\".\n",
+    "\n",
+    "    Raises:\n",
+    "        KeyError: if the entry has no \"comments\" or a comment lacks an expected field.\n",
+    "    \"\"\"\n",
+    "    sentences = []\n",
+    "    reactions = []\n",
+    "\n",
+    "    for comment in json_dict[\"comments\"]:\n",
+    "        comment_type = comment[\"commentType\"]\n",
+    "        if comment_type == \"FUNCTION\":\n",
+    "            # Walk the comment's own texts; their positions are unrelated to the\n",
+    "            # position of the comment inside the \"comments\" list.\n",
+    "            for text in comment[\"texts\"]:\n",
+    "                sentences.append(remove_pubchem_med_from_text(text[\"value\"]))\n",
+    "        elif comment_type == \"CATALYTIC ACTIVITY\":\n",
+    "            ### Replace = with -> for latex\n",
+    "            reactions.append(comment[\"reaction\"][\"name\"].replace(\"=\", \"->\"))\n",
+    "    # dict.fromkeys removes duplicates but, unlike set(), keeps a reproducible order.\n",
+    "    return list(dict.fromkeys(sentences)), list(dict.fromkeys(reactions))\n",
+    "\n",
+    "\n",
+    "def extract_binding_sites(json_dict: Dict):\n",
+    "    \"\"\"Return the start positions of all \"Binding site\" features of `json_dict`.\n",
+    "\n",
+    "    Only the entry passed in is consulted, never a notebook global. An entry\n",
+    "    without a \"features\" key yields an empty list.\n",
+    "    \"\"\"\n",
+    "    return [\n",
+    "        feature[\"location\"][\"start\"][\"value\"]\n",
+    "        for feature in json_dict.get(\"features\", [])\n",
+    "        if feature[\"type\"] == \"Binding site\"\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "def extract_all(json_dict):\n",
+    "    \"\"\"Build the flat record for one UniProt entry.\n",
+    "\n",
+    "    The record holds the amino-acid sequence, the reactions joined with \"|\",\n",
+    "    the list of sentences, and the binding-site positions joined with \",\".\n",
+    "    \"\"\"\n",
+    "    sequence = json_dict[\"sequence\"][\"value\"]\n",
+    "    sentences, reactions = extract_catalyzed_reactions(json_dict)\n",
+    "    site_positions = extract_binding_sites(json_dict)\n",
+    "    return {\n",
+    "        \"sequence\": sequence,\n",
+    "        \"reactions\": '|'.join(map(str, reactions)),\n",
+    "        \"sentences\": sentences,\n",
+    "        \"binding sites\": ','.join(map(str, site_positions)),\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6d7ed83d-bd6f-4b7c-9f89-71a3ed9a54c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'sequence': {'MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR': {'reactions': \"NAD(+) -> 2'cADPR + H(+) + nicotinamide|H2O + NADP(+) -> ADP-D-ribose 2'-phosphate + H(+) + nicotinamide|H2O + NAD(+) -> ADP-D-ribose + H(+) + nicotinamide\",\n",
+       "   'binding sites': '143,172,202,245'}}}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extract_all(example)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "845867c8-6457-4c3f-b359-e247f1d3c757",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proteins = read_json_to_list_of_proteins(\"uniprotkb_AND_reviewed_true_2023_10_30.json\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "3bb06419-8f01-4205-aa48-afebdfd68824",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "data = []\n",
+    "skipped = 0\n",
+    "\n",
+    "for prot_id, prot_seq_json in enumerate(proteins):\n",
+    "    try:\n",
+    "        data.append({prot_id : extract_all(prot_seq_json)})\n",
+    "    except (KeyError, IndexError, TypeError):\n",
+    "        # Entries without the expected fields are left out, but counted so the loss stays visible.\n",
+    "        skipped += 1\n",
+    "\n",
+    "print(f\"kept {len(data)} proteins, skipped {skipped}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1586f0d7-70ea-4bfe-b0e4-6800cbe3e435",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "d295a250-d5e9-41fe-a3c4-84c47df9e5ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import ChainMap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "f6a7b9f3-bb41-41ba-b283-547907d23fa6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{}"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# data = dict(ChainMap(*data))\n",
+    "a = {}\n",
+    "a\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "bcbc63f4-e586-4c1a-a5bf-4964161276d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for record_by_index in data:\n",
+    "    # Each element of `data` is a dict whose first item is (protein index, record).\n",
+    "    index, record = list(record_by_index.items())[0]\n",
+    "    a[int(index)] = record"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "b432cca1-5582-4f5e-9c70-6af37f425559",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One list per output column, all in the insertion order of `a`.\n",
+    "reactions = [record[\"reactions\"] for record in a.values()]\n",
+    "sequences = [record[\"sequence\"] for record in a.values()]\n",
+    "sentences = [record[\"sentences\"] for record in a.values()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "2ccd268b-0902-45fe-89b1-6a83d2b3bcab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assemble the output table; column order is sequence, sentences, reactions.\n",
+    "df = pd.DataFrame().assign(\n",
+    "    sequence=sequences,\n",
+    "    sentences=sentences,\n",
+    "    reactions=reactions,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "3666eb32-e26c-4a96-a37d-73e064d725ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# index=False keeps the row index out of the file, so no extra unnamed column appears on reload.\n",
+    "df.to_csv(\"reactions_sentences.csv\", index=False)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/tabular/uniprot/transform.py b/data/tabular/uniprot/transform.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+# Hosted copy of reactions_sentences.csv, used when no local copy is present.
+DATA_URL = "https://huggingface.co/datasets/chemNLP/uniprot/resolve/main/reactions_sentences.csv"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Load the UniProt table, write it to ``data_clean.csv`` and return it.
+
+    ``reactions_sentences.csv`` is read from the working directory when it
+    exists; otherwise it is fetched from ``DATA_URL``. Columns named
+    ``Unnamed: ...`` (a row index that was saved together with the data) are
+    dropped before the cleaned file is written without an index.
+
+    Returns:
+        The cleaned table.
+    """
+    try:
+        uniprot = pd.read_csv("reactions_sentences.csv")
+    except FileNotFoundError:
+        # A fresh checkout has no local CSV; fall back to the hosted file.
+        uniprot = pd.read_csv(DATA_URL)
+    # pandas labels a header-less index column "Unnamed: 0" on reload; it is
+    # not part of the dataset and must not leak into data_clean.csv.
+    uniprot = uniprot.loc[:, ~uniprot.columns.str.startswith("Unnamed:")]
+    uniprot.to_csv("data_clean.csv", index=False)
+    return uniprot
+
+
+if __name__ == "__main__":
+    print(len(load_dataset()))