OpenBioML · kjappelbaum · Nov 18, 2023 · Oct 30, 2023 · Nov 1, 2023 · Nov 1, 2023
diff --git a/data/tabular/uniprot_binding_sites/meta.yaml b/data/tabular/uniprot_binding_sites/meta.yaml
@@ -0,0 +1,37 @@
+---
+name: uniprot_binding_sites
+description: |-
+    Protein sequences and the indices of their binding sites.
+targets:
+    - id: binding_sites
+      description: binding sites of a protein
+      type: text
+      names:
+          - noun: binding sites
+identifiers:
+    - id: other
+      type: other
+      description: amino-acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 216329
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      User: What are the binding site indices in this {#protein|amino-acid sequence|AA sequence|polypeptide!} {other#}?
+      Assistant: The binding site indices are: {binding_sites#}.
diff --git a/data/tabular/uniprot_binding_sites/transform.py b/data/tabular/uniprot_binding_sites/transform.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+FILENAME = "uniprot_binding_sites"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Download the dataset CSV, rename `sequence` to `other`, and save it."""
+    source_url = f"https://huggingface.co/datasets/chemNLP/uniprot/resolve/main/{FILENAME}/data_clean.csv"  # noqa: E501
+    frame = pd.read_csv(source_url).rename(columns={"sequence": "other"})
+    # The copy is written to the current working directory.
+    frame.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {FILENAME}!")
+    return frame
+
+
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_organisms/meta.yaml b/data/tabular/uniprot_organisms/meta.yaml
@@ -0,0 +1,37 @@
+---
+name: uniprot_organisms
+description: |-
+    Organisms in which an amino-acid sequence can be found.
+targets:
+    - id: organisms
+      description: organisms in which a protein can be found
+      type: text
+      names:
+          - noun: organisms
+identifiers:
+    - id: other
+      type: other
+      description: amino-acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 560033
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      User: In what organism can you find the following {#protein|amino-acid sequence|AA sequence|polypeptide!} {other#}?
+      Assistant: The given {#polypeptide|protein|amino-acid sequence|AA sequence!} can be found in {organisms#}.
diff --git a/data/tabular/uniprot_organisms/transform.py b/data/tabular/uniprot_organisms/transform.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+FILENAME = "uniprot_organisms"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Download the dataset CSV, rename `sequence` to `other`, and save it."""
+    source_url = f"https://huggingface.co/datasets/chemNLP/uniprot/resolve/main/{FILENAME}/data_clean.csv"  # noqa: E501
+    frame = pd.read_csv(source_url).rename(columns={"sequence": "other"})
+    # The copy is written to the current working directory.
+    frame.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {FILENAME}!")
+    return frame
+
+
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_reactions/meta.yaml b/data/tabular/uniprot_reactions/meta.yaml
@@ -0,0 +1,38 @@
+---
+name: uniprot_reactions
+description: |-
+    Protein sequences and the reactions these can catalyze.
+targets:
+    - id: reactions
+      description: biochemical reactions catalyzed by a protein
+      type: text
+      names:
+          - noun: chemical reactions
+          - noun: biochemical reactions
+identifiers:
+    - id: other
+      type: other
+      description: amino-acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 253713
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      User: What {#biochemical|chemical|bio-chemical!} reactions can be catalyzed by the following {#protein|amino-acid sequence|AA sequence|polypeptide!}: {other#}?
+      Assistant: The reactions that can be catalyzed by the given sequence are: {reactions#}.
diff --git a/data/tabular/uniprot_reactions/transform.py b/data/tabular/uniprot_reactions/transform.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+FILENAME = "uniprot_reactions"
+
+
+def load_dataset() -> pd.DataFrame:
+    """Download the dataset CSV, rename `sequence` to `other`, and save it."""
+    source_url = f"https://huggingface.co/datasets/chemNLP/uniprot/resolve/main/{FILENAME}/data_clean.csv"  # noqa: E501
+    frame = pd.read_csv(source_url).rename(columns={"sequence": "other"})
+    # The copy is written to the current working directory.
+    frame.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {FILENAME}!")
+    return frame
+
+
+if __name__ == "__main__":
+    load_dataset()
diff --git a/data/tabular/uniprot_sentences/meta.yaml b/data/tabular/uniprot_sentences/meta.yaml
@@ -0,0 +1,44 @@
+---
+name: uniprot_sentences
+description: |-
+    Descriptions of the function of a protein.
+targets:
+    - id: sentences
+      description: sentences describing the function of a protein
+      type: text
+      names:
+          - noun: function
+identifiers:
+    - id: other
+      type: other
+      description: amino-acid sequence of the protein
+license: MIT
+links:
+    - url: https://www.uniprot.org/
+      description: data source
+num_points: 464396
+bibtex:
+    - |-
+      @article{10.1093/nar/gkac1052,
+      author = {The UniProt Consortium},
+      title = {UniProt - the Universal Protein Knowledgebase in 2023},
+      journal = {Nucleic Acids Research},
+      volume = {51},
+      number = {D1},
+      pages = {D523-D531},
+      year = {2022},
+      month = {11},
+      issn = {0305-1048},
+      doi = {10.1093/nar/gkac1052},
+      url = {https://doi.org/10.1093/nar/gkac1052}}
+templates:
+    - |-
+      User: Describe the {#function|biological function!} of the {#protein|amino-acid sequence|AA sequence|polypeptide!} {other#}.
+      Assistant: {sentences#}.
+    - |-
+      User: What {#protein|amino-acid sequence|AA sequence|polypeptide!} best fits the {#function|biological function!} described in the next sentence(s)? {sentences#}
+      Assistant: The {#protein|amino-acid sequence|AA sequence|polypeptide!} that best fits the described function is {other#}.
+    - |-
+      Task: {#Generate|Create|Come up with!} a {#AA sequence|protein!} based on the description.
+      Descriptions: {sentences#}
+      {#Output|Result!}: {other#}
diff --git a/data/tabular/uniprot_sentences/transform.py b/data/tabular/uniprot_sentences/transform.py
@@ -0,0 +1,26 @@
+import pandas as pd
+import regex as re
+
+FILENAME = "uniprot_sentences"
+
+
+def remove_text_from_column(sentence: str) -> str:
+    """Return `sentence` without "(By similarity)" and the whitespace before it."""
+    marker = r"\s*\(By similarity\)"
+    return re.sub(marker, "", sentence)
+
+
+def load_dataset() -> pd.DataFrame:
+    """Download the dataset CSV, clean the sentences, and save it locally."""
+    source_url = f"https://huggingface.co/datasets/chemNLP/uniprot/resolve/main/{FILENAME}/data_clean.csv"  # noqa: E501
+    frame = pd.read_csv(source_url).rename(columns={"sequence": "other"})
+
+    # Drop the "(By similarity)" annotations from every description.
+    frame["sentences"] = frame["sentences"].apply(remove_text_from_column)
+    frame.to_csv("data_clean.csv", index=False)
+    print(f"Successfully loaded {FILENAME}!")
+    return frame
+
+
+if __name__ == "__main__":
+    load_dataset()