feat: add Uniprot (#470)

Co-authored-by: Kevin Maik Jablonka <[email protected]> Co-authored-by: Michael Pieler <[email protected]> Co-authored-by: Kevin M Jablonka <[email protected]> Co-authored-by: Michael Pieler <[email protected]>
OpenBioML · Nov 18, 2023 · ef1bb66 · ef1bb66
1 parent 698f2ea
commit ef1bb66
Show file tree

Hide file tree

Showing 9 changed files with 324 additions and 0 deletions.
diff --git a/data/tabular/uniprot_binding_sites/meta.yaml b/data/tabular/uniprot_binding_sites/meta.yaml
@@ -0,0 +1,58 @@
+---
+name: uniprot_binding_sites
+description: |-
+    Binding sites of a molecule in protein sequences.
+targets:
+    - id: start_binding_site
+      description: index for start of the binding sites of a protein
+      type: text
+      names:
+          - noun: start binding site
+    - id: end_binding_site
+      description: index for end of the binding sites of a protein
+      type: text
+      names:
+          - noun: end binding site
+    - id: SMILES
+      description: SMILES
+      type: SMILES
+      names:
+          - noun: SMILES
+identifiers:
+    - id: sequence
+      type: AS_SEQUENCE
+      description: amino acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 780449
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      Question: What are the binding sites of the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} in this {#AA|amino acid!} sequence {sequence#}?
+      Answer: The binding site for the {#molecule|chemical|compound!} with the SMILES {SMILES#} in the given {#AA|amino acid!} sequence is: {start_binding_site#}-{end_binding_site#}.
+    - |-
+      Question: What molecule can bind in the binding site {start_binding_site#}-{end_binding_site#} in the amino acid sequence below?
+      {#AA|amino acid!} sequence: {sequence#}.
+      Answer: {SMILES#}
+    - |-
+      Task: Design a binding site in the {#AA|amino acid!} sequence {sequence#}, in which the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} can bind.
+      Answer: {start_binding_site#}-{end_binding_site#}
+    - |-
+      Task: Design a {#molecule|chemical|compound!} that binds to a given site in the {#AA|amino acid!} sequence {sequence#}.
+      Description: The binding site is {start_binding_site#}-{end_binding_site#}.
+      Answer: {SMILES#}
diff --git a/data/tabular/uniprot_binding_sites/transform.py b/data/tabular/uniprot_binding_sites/transform.py
@@ -0,0 +1,25 @@
+import pandas as pd
+from huggingface_hub import hf_hub_download
+
+# Sub-folder of the chemnlp/uniprot Hub dataset that holds this table's CSV;
+# also used as the dataset name in the log messages of load_dataset().
+DATA = "uniprot_binding_sites"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Fetch the binding-site table, normalise it and write ``data_clean.csv``.
+
+    The file ``{DATA}/data_clean.csv`` is obtained from the ``chemnlp/uniprot``
+    dataset repository via ``hf_hub_download``.  ``end_binding_site`` is cast
+    to ``int`` and exact duplicate rows are dropped before the frame is saved
+    (without index) under a relative path, i.e. in the working directory.
+
+    Returns:
+        The cleaned table that was written to disk.
+    """
+    table = pd.read_csv(
+        hf_hub_download(
+            repo_id="chemnlp/uniprot",
+            filename=f"{DATA}/data_clean.csv",
+            repo_type="dataset",
+        )
+    )
+    table["end_binding_site"] = table.end_binding_site.astype(int)
+    table = table.drop_duplicates()
+    print(f"Successfully loaded {DATA}! {len(table)} rows")
+    table.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {DATA}!")
+    return table
+
+
+# Running this file as a script performs the download, clean-up and save step.
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_organisms/meta.yaml b/data/tabular/uniprot_organisms/meta.yaml
@@ -0,0 +1,47 @@
+---
+name: uniprot_organisms
+description: |-
+    Organisms in which an amino-acid sequence can be found.
+targets:
+    - id: organisms
+      description: organisms in which a protein can be found
+      type: text
+      names:
+          - noun: organisms
+identifiers:
+    - id: other
+      type: AS_SEQUENCE
+      description: other
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 559428
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      The protein with the {#amino acid sequence|AA sequence!} {other#} can be found in {#the organism |!}{organisms#}.
+    - |-
+      Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found.
+      {#Amino acid sequence|Sequence|AA sequence!}: {other#}
+      Result: {organisms#}
+    - |-
+      User: In what organism can you find the following {#protein|amino acid sequence|AA sequence|polypeptide!}: {other#}?
+      Assistant: The given {#polypeptide|protein|amino acid sequence|AA sequence!} can be found in {organisms#}.
+    - |-
+      Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found.
+      {#Amino acid sequence|Sequence|AA sequence!}: {other#}
+      Result:<EOI> {organisms#}
diff --git a/data/tabular/uniprot_organisms/transform.py b/data/tabular/uniprot_organisms/transform.py
@@ -0,0 +1,25 @@
+import pandas as pd
+from huggingface_hub import hf_hub_download
+
+# Sub-folder of the chemnlp/uniprot Hub dataset that holds this table's CSV;
+# also used as the dataset name in the log messages of load_dataset().
+DATA = "uniprot_organisms"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Fetch the organisms table, normalise it and write ``data_clean.csv``.
+
+    The file ``{DATA}/data_clean.csv`` is obtained from the ``chemnlp/uniprot``
+    dataset repository via ``hf_hub_download``.  The ``sequence`` column is
+    renamed to ``other`` and exact duplicate rows are dropped before the frame
+    is saved (without index) under a relative path, i.e. in the working
+    directory.
+
+    Returns:
+        The cleaned table that was written to disk.
+    """
+    table = pd.read_csv(
+        hf_hub_download(
+            repo_id="chemnlp/uniprot",
+            filename=f"{DATA}/data_clean.csv",
+            repo_type="dataset",
+        )
+    )
+    table = table.rename(columns={"sequence": "other"})
+    table = table.drop_duplicates()
+    print(f"Successfully loaded {DATA}! {len(table)} rows")
+    table.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {DATA}!")
+    return table
+
+
+# Running this file as a script performs the download, clean-up and save step.
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_reactions/meta.yaml b/data/tabular/uniprot_reactions/meta.yaml
@@ -0,0 +1,56 @@
+---
+name: uniprot_reactions
+description: |-
+    Protein sequences and the reactions these can catalyze.
+targets:
+    - id: reactions
+      description: biochemical reactions catalyzed by a protein
+      type: text
+      names:
+          - noun: chemical reactions
+          - noun: biochemical reactions
+identifiers:
+    - id: other
+      type: AS_SEQUENCE
+      description: other
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 253713
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      The {#protein|amino acid sequence|AA sequence|polypeptide!} {#with the sequence |!}{other#} catalyzes the {#following |!}{#chemical |biochemical |!}reaction: {reactions#}
+    - |-
+      Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}.
+      {#Amino acid sequence|Sequence|AA sequence!}: {other#}
+      Result: {reactions#}
+    - |-
+      Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction.
+      Reaction: {reactions#}
+      {#Output|Result!}: {other#}
+    - |-
+      User: Can you {#tell me|come up with!} a {#biochemical |chemical |!}reaction that can be catalyzed by the following {#protein|amino acid sequence|AA sequence|polypeptide!}:\n{other#}
+      Assistant: {#Yes, the|Sure, the|Yes, sure, the|The!} {#chemical |biochemical |!}reaction that can be catalyzed by the given {#protein|amino acid sequence|AA sequence|polypeptide!} is:\n{reactions#}
+    - |-
+      Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}.
+      {#Amino acid sequence|Sequence|AA sequence!}: {other#}
+      Result:<EOI> {reactions#}
+    - |-
+      Task: {#Generate|Create|Come up with|Design!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction.
+      Reaction: {reactions#}
+      {#Output|Result!}:<EOI> {other#}
diff --git a/data/tabular/uniprot_reactions/transform.py b/data/tabular/uniprot_reactions/transform.py
@@ -0,0 +1,25 @@
+import pandas as pd
+from huggingface_hub import hf_hub_download
+
+# Sub-folder of the chemnlp/uniprot Hub dataset that holds this table's CSV;
+# also used as the dataset name in the log messages of load_dataset().
+DATA = "uniprot_reactions"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Fetch the reactions table, normalise it and write ``data_clean.csv``.
+
+    The file ``{DATA}/data_clean.csv`` is obtained from the ``chemnlp/uniprot``
+    dataset repository via ``hf_hub_download``.  The ``sequence`` column is
+    renamed to ``other`` and exact duplicate rows are dropped before the frame
+    is saved (without index) under a relative path, i.e. in the working
+    directory.
+
+    Returns:
+        The cleaned table that was written to disk.
+    """
+    table = pd.read_csv(
+        hf_hub_download(
+            repo_id="chemnlp/uniprot",
+            filename=f"{DATA}/data_clean.csv",
+            repo_type="dataset",
+        )
+    )
+    table = table.rename(columns={"sequence": "other"})
+    table = table.drop_duplicates()
+    print(f"Successfully loaded {DATA}! {len(table)} rows")
+    table.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {DATA}!")
+    return table
+
+
+# Running this file as a script performs the download, clean-up and save step.
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_sentences/meta.yaml b/data/tabular/uniprot_sentences/meta.yaml
@@ -0,0 +1,48 @@
+---
+name: uniprot_sentences
+description: |-
+    Descriptions of the function of a protein.
+targets:
+    - id: sentences
+      description: sentences describing the function of a protein
+      type: text
+      names:
+          - noun: function
+identifiers:
+    - id: sequence
+      type: AS_SEQUENCE
+      description: amino acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 396241
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      User: {#Please describe|Describe|Please briefly describe|Briefly describe!} the {#biological |biochemical |!}function of {#the|this!} {#protein|amino acid sequence|AA sequence|polypeptide!}: {sequence#}
+      Assistant: {sentences#}.
+    - |-
+      User: What {#protein|amino acid sequence|AA sequence|polypeptide!} fits the {#biological |biochemical |!}description {#in the next sentence(s) |below |!}best?\n{sentences#}
+      Assistant: A {#protein|amino acid sequence|AA sequence|polypeptide!} that fits the {#description|sentences!} is:\n{sequence#}
+    - |-
+      Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description.
+      Description: {sentences#}
+      {#Output|Result!}: {sequence#}
+    - |-
+      Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description.
+      Description: {sentences#}
+      {#Output|Result!}:<EOI> {sequence#}
diff --git a/data/tabular/uniprot_sentences/transform.py b/data/tabular/uniprot_sentences/transform.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import regex as re
+from huggingface_hub import hf_hub_download
+
+# Sub-folder of the chemnlp/uniprot Hub dataset that holds this table's CSV;
+# also used as the dataset name in the log messages of load_dataset().
+DATA = "uniprot_sentences"
+
+
+def clean_up_sentences(text: str) -> str:
+    """Remove "(By similarity)" annotations from a functional description.
+
+    The marker is deleted together with the whitespace around it.  When it sat
+    between two stretches of running text a single space is kept, so that
+    "binds DNA (By similarity) and RNA" becomes "binds DNA and RNA" instead of
+    "binds DNAand RNA".  Afterwards a space left in front of a full stop is
+    removed.
+
+    Args:
+        text: Description text, possibly containing "(By similarity)" or
+            "(By. similarity)" markers.
+
+    Returns:
+        The text without the markers.
+    """
+
+    def _separator(match) -> str:
+        # Nothing is inserted at the very start/end of the text or directly
+        # before punctuation that attaches to the preceding word; everywhere
+        # else one space keeps the neighbouring words apart.
+        at_edge = match.start() == 0 or match.end() == len(text)
+        if at_edge or text[match.end()] in ".,;:!?)":
+            return ""
+        return " "
+
+    updated_text = re.sub(r"\s*\((?:By\.? similarity)\)\s*", _separator, text)
+    updated_text = updated_text.replace(" . ", ". ")
+    updated_text = updated_text.replace(" .", ".")
+    return updated_text
+
+
+def load_dataset() -> pd.DataFrame:
+    """Fetch the function-description table, clean it and write ``data_clean.csv``.
+
+    The file ``{DATA}/data_clean.csv`` is obtained from the ``chemnlp/uniprot``
+    dataset repository via ``hf_hub_download``.  Every entry of the
+    ``sentences`` column is passed through ``clean_up_sentences`` and exact
+    duplicate rows are dropped before the frame is saved (without index) under
+    a relative path, i.e. in the working directory.
+
+    Returns:
+        The cleaned table that was written to disk.
+    """
+    table = pd.read_csv(
+        hf_hub_download(
+            repo_id="chemnlp/uniprot",
+            filename=f"{DATA}/data_clean.csv",
+            repo_type="dataset",
+        )
+    )
+
+    table["sentences"] = table.sentences.apply(clean_up_sentences)
+    table = table.drop_duplicates()
+    print(f"Successfully loaded {DATA}! {len(table)} rows")
+    table.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {DATA}!")
+    return table
+
+
+# Running this file as a script performs the download, clean-up and save step.
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py
@@ -168,6 +168,10 @@
     "mol_repr_transl_canonical_iupac_name",
     "mol_repr_transl_inchi_iupac_name",
     # "h2_storage_materials",  # only IUPAC identifier, more than one target, LOW PRIO: has only 30 samples
+    "uniprot_binding_sites",
+    "uniprot_organisms",
+    "uniprot_reactions",
+    "uniprot_sentences",
 ]