Skip to content

Commit

Permalink
feat: add Uniprot (#470)
Browse files Browse the repository at this point in the history
Co-authored-by: Kevin Maik Jablonka <[email protected]>
Co-authored-by: Michael Pieler <[email protected]>
Co-authored-by: Kevin M Jablonka <[email protected]>
Co-authored-by: Michael Pieler <[email protected]>
  • Loading branch information
5 people authored Nov 18, 2023
1 parent 698f2ea commit ef1bb66
Show file tree
Hide file tree
Showing 9 changed files with 324 additions and 0 deletions.
58 changes: 58 additions & 0 deletions data/tabular/uniprot_binding_sites/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
name: uniprot_binding_sites
description: |-
Binding sites of a molecule in protein sequences.
targets:
- id: start_binding_site
description: index for start of the binding sites of a protein
type: text
names:
- noun: start binding site
- id: end_binding_site
description: index for end of the binding sites of a protein
type: text
names:
- noun: end binding site
- id: SMILES
description: SMILES
type: SMILES
names:
- noun: SMILES
identifiers:
- id: sequence
type: AS_SEQUENCE
description: other
license: MIT
links:
- url: https://www.uniprot.org/
description: data source
num_points: 780449
bibtex:
- |-
@article{10.1093/nar/gkac1052,
author = {The UniProt Consortium},
title = {UniProt - the Universal Protein Knowledgebase in 2023},
journal = {Nucleic Acids Research},
volume = {51},
number = {D1},
pages = {D523-D531},
year = {2022},
month = {11},
issn = {0305-1048},
doi = {10.1093/nar/gkac1052},
url = {https://doi.org/10.1093/nar/gkac1052}}
templates:
- |-
Question: What are the binding sites of the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} in this {#AA|amino acid!} sequence {sequence#}?
Answer: The binding site for the {#molecule|chemical|compound!} with the SMILES {SMILES#} in the given {#AA|amino acid!} sequence is: {start_binding_site#}-{end_binding_site#}.
- |-
Question: What molecule can bind in the binding site {start_binding_site#}-{end_binding_site#} in the amino acid sequence below?
{#AA|amino acid!} sequence: {sequence#}.
Answer: {SMILES#}
- |-
Task: Design a binding site in the {#AA|amino acid!} sequence {sequence#}, in which the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} can bind.
Answer: {start_binding_site#}-{end_binding_site#}
- |-
Task: Design a {#molecule|chemical|compound!} that binds to a given site in the {#AA|amino acid!} sequence {sequence#}.
Description: The binding site is {start_binding_site#}-{end_binding_site#}.
Answer: {SMILES#}
25 changes: 25 additions & 0 deletions data/tabular/uniprot_binding_sites/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
from huggingface_hub import hf_hub_download

# Dataset name: used as the subfolder of the hub file path and in log messages.
DATA = "uniprot_binding_sites"


def load_dataset() -> pd.DataFrame:
    """Fetch the binding-site table, tidy it up and store it locally.

    The file ``{DATA}/data_clean.csv`` is downloaded from the
    ``chemnlp/uniprot`` dataset repository, the ``end_binding_site`` column
    is cast to ``int``, duplicate rows are dropped, and the result is
    written to ``data_clean.csv`` (relative to the current working
    directory).

    Returns:
        The de-duplicated table as a DataFrame.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )

    frame = pd.read_csv(csv_path)
    frame["end_binding_site"] = frame["end_binding_site"].astype(int)
    frame = frame.drop_duplicates()

    n_rows = len(frame)
    print(f"Successfully loaded {DATA}! {n_rows} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame


# Run the download-and-clean step when this file is executed as a script.
if __name__ == "__main__":
    load_dataset()
47 changes: 47 additions & 0 deletions data/tabular/uniprot_organisms/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
---
name: uniprot_organisms
description: |-
Organisms in which an amino-acid sequence can be found.
targets:
- id: organisms
description: organisms in which a protein can be found
type: text
names:
- noun: organisms
identifiers:
- id: other
type: AS_SEQUENCE
description: other
license: MIT
links:
- url: https://www.uniprot.org/
description: data source
num_points: 559428
bibtex:
- |-
@article{10.1093/nar/gkac1052,
author = {The UniProt Consortium},
title = {UniProt - the Universal Protein Knowledgebase in 2023},
journal = {Nucleic Acids Research},
volume = {51},
number = {D1},
pages = {D523-D531},
year = {2022},
month = {11},
issn = {0305-1048},
doi = {10.1093/nar/gkac1052},
url = {https://doi.org/10.1093/nar/gkac1052}}
templates:
- |-
The protein with the {#amino acid sequence|AA sequence!} {other#} can be found in {#the organism |!}{organisms#}.
- |-
Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found.
{#Amino acid sequence|Sequence|AA sequence!}: {other#}
Result: {organisms#}
- |-
User: In what organism can you find the following {#protein|amino acid sequence|AA sequence|polypeptide!}: {other#}?
Assistant: The given {#polypeptide|protein|amino acid sequence|AA sequence!} can be found in {organisms#}.
- |-
Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found.
{#Amino acid sequence|Sequence|AA sequence!}: {other#}
Result:<EOI> {organisms#}
25 changes: 25 additions & 0 deletions data/tabular/uniprot_organisms/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
from huggingface_hub import hf_hub_download

# Dataset name: used as the subfolder of the hub file path and in log messages.
DATA = "uniprot_organisms"


def load_dataset() -> pd.DataFrame:
    """Fetch the organisms table, tidy it up and store it locally.

    The file ``{DATA}/data_clean.csv`` is downloaded from the
    ``chemnlp/uniprot`` dataset repository, the ``sequence`` column is
    renamed to ``other``, duplicate rows are dropped, and the result is
    written to ``data_clean.csv`` (relative to the current working
    directory).

    Returns:
        The de-duplicated table as a DataFrame.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )

    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={"sequence": "other"})
    frame = frame.drop_duplicates()

    n_rows = len(frame)
    print(f"Successfully loaded {DATA}! {n_rows} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame


# Run the download-and-clean step when this file is executed as a script.
if __name__ == "__main__":
    load_dataset()
56 changes: 56 additions & 0 deletions data/tabular/uniprot_reactions/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
---
name: uniprot_reactions
description: |-
Protein sequences and the reactions these can catalyze.
targets:
- id: reactions
description: biochemical reactions catalyzed by a protein
type: text
names:
- noun: chemical reactions
- noun: biochemical reactions
identifiers:
- id: other
type: AS_SEQUENCE
description: other
license: MIT
links:
- url: https://www.uniprot.org/
description: data source
num_points: 253713
bibtex:
- |-
@article{10.1093/nar/gkac1052,
author = {The UniProt Consortium},
title = {UniProt - the Universal Protein Knowledgebase in 2023},
journal = {Nucleic Acids Research},
volume = {51},
number = {D1},
pages = {D523-D531},
year = {2022},
month = {11},
issn = {0305-1048},
doi = {10.1093/nar/gkac1052},
url = {https://doi.org/10.1093/nar/gkac1052}}
templates:
- |-
The {#protein|amino acid sequence|AA sequence|polypeptide!} {#with the sequence |!}{other#} catalyzes the {#following |!}{#chemical |biochemical |!}reaction: {reactions#}
- |-
Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}.
{#Amino acid sequence|Sequence|AA sequence!}: {other#}
Result: {reactions#}
- |-
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction.
Reaction: {reactions#}
{#Output|Result!}: {other#}
- |-
User: Can you {#tell me|come up with!} a {#biochemical |chemical |!}reaction that can be catalyzed by the following {#protein|amino acid sequence|AA sequence|polypeptide!}:\n{other#}
Assistant: {#Yes, the|Sure, the|Yes, sure, the|The!} {#chemical |biochemical |!}reaction that can be catalyzed by the given {#protein|amino acid sequence|AA sequence|polypeptide!} is:\n{reactions#}
- |-
Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}.
{#Amino acid sequence|Sequence|AA sequence!}: {other#}
Result:<EOI> {reactions#}
- |-
Task: {#Generate|Create|Come up with|Design!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction.
Reaction: {reactions#}
{#Output|Result!}:<EOI> {other#}
25 changes: 25 additions & 0 deletions data/tabular/uniprot_reactions/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
from huggingface_hub import hf_hub_download

# Dataset name: used as the subfolder of the hub file path and in log messages.
DATA = "uniprot_reactions"


def load_dataset() -> pd.DataFrame:
    """Fetch the reactions table, tidy it up and store it locally.

    The file ``{DATA}/data_clean.csv`` is downloaded from the
    ``chemnlp/uniprot`` dataset repository, the ``sequence`` column is
    renamed to ``other``, duplicate rows are dropped, and the result is
    written to ``data_clean.csv`` (relative to the current working
    directory).

    Returns:
        The de-duplicated table as a DataFrame.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )

    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={"sequence": "other"})
    frame = frame.drop_duplicates()

    n_rows = len(frame)
    print(f"Successfully loaded {DATA}! {n_rows} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame


# Run the download-and-clean step when this file is executed as a script.
if __name__ == "__main__":
    load_dataset()
48 changes: 48 additions & 0 deletions data/tabular/uniprot_sentences/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
name: uniprot_sentences
description: |-
Descriptions of the function of a protein.
targets:
- id: sentences
description: sentences describing the function of a protein
type: text
names:
- noun: function
identifiers:
- id: sequence
type: AS_SEQUENCE
description: other
license: MIT
links:
- url: https://www.uniprot.org/
description: data source
num_points: 396241
bibtex:
- |-
@article{10.1093/nar/gkac1052,
author = {The UniProt Consortium},
title = {UniProt - the Universal Protein Knowledgebase in 2023},
journal = {Nucleic Acids Research},
volume = {51},
number = {D1},
pages = {D523-D531},
year = {2022},
month = {11},
issn = {0305-1048},
doi = {10.1093/nar/gkac1052},
url = {https://doi.org/10.1093/nar/gkac1052}}
templates:
- |-
User: {#Please describe|Describe|Please briefly describe|Briefly describe!} the {#biological |biochemical |!}function of {#the|this!} {#protein|amino acid sequence|AA sequence|polypeptide!}: {sequence#}
Assistant: {sentences#}.
- |-
User: What {#protein|amino acid sequence|AA sequence|polypeptide!} fits the {#biological |biochemical |!}description {#in the next sentence(s) |below |!}best?\n{sentences#}
Assistant: A {#protein|amino acid sequence|AA sequence|polypeptide!} that fits the {#description|sentences!} is:\n{sequence#}
- |-
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description.
Description: {sentences#}
{#Output|Result!}: {sequence#}
- |-
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description.
Description: {sentences#}
{#Output|Result!}:<EOI> {sequence#}
36 changes: 36 additions & 0 deletions data/tabular/uniprot_sentences/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import regex as re
from huggingface_hub import hf_hub_download

# Dataset name: used as the subfolder of the hub file path and in log messages.
DATA = "uniprot_sentences"


def _similarity_replacement(match) -> str:
    """Choose the text that replaces one "(By similarity)" marker.

    The pattern also swallows the whitespace on both sides of the marker, so
    a single space has to be put back whenever the marker sat between two
    pieces of running text; otherwise the neighbouring words would be glued
    together. Nothing is inserted at the very start or end of the string, or
    in front of punctuation.
    """
    whole = match.string
    before = whole[: match.start()]
    after = whole[match.end():]
    if not before or not after or after[0] in ".,;:!?)":
        return ""
    return " "


def clean_up_sentences(text: str) -> str:
    """Remove "(By similarity)" markers from the sentences.

    Both "(By similarity)" and "(By. similarity)" are stripped together with
    the whitespace around them. A marker in the middle of a sentence is
    replaced by one space so the surrounding words stay separated. Afterwards
    a space left in front of a full stop is removed.

    Args:
        text: The raw description text.

    Returns:
        The text without the markers.
    """
    updated_text = re.sub(
        r"\s*\((?:By\.? similarity)\)\s*", _similarity_replacement, text
    )
    # Both replacements are needed: the first one also collapses the case
    # where an extra space precedes " . ".
    updated_text = updated_text.replace(" . ", ". ")
    updated_text = updated_text.replace(" .", ".")
    return updated_text


def load_dataset() -> pd.DataFrame:
    """Fetch the function-description table, clean it and store it locally.

    The file ``{DATA}/data_clean.csv`` is downloaded from the
    ``chemnlp/uniprot`` dataset repository, every entry of the ``sentences``
    column is passed through ``clean_up_sentences``, duplicate rows are
    dropped, and the result is written to ``data_clean.csv`` (relative to
    the current working directory).

    Returns:
        The cleaned, de-duplicated table as a DataFrame.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )

    frame = pd.read_csv(csv_path)
    frame["sentences"] = frame["sentences"].apply(clean_up_sentences)
    frame = frame.drop_duplicates()

    n_rows = len(frame)
    print(f"Successfully loaded {DATA}! {n_rows} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame


# Run the download-and-clean step when this file is executed as a script.
if __name__ == "__main__":
    load_dataset()
4 changes: 4 additions & 0 deletions data/text_sampling/text_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@
"mol_repr_transl_canonical_iupac_name",
"mol_repr_transl_inchi_iupac_name",
# "h2_storage_materials", # only IUPAC identifier, more than one target, LOW PRIO: has only 30 samples
"uniprot_binding_sites",
"uniprot_organisms",
"uniprot_reactions",
"uniprot_sentences",
]


Expand Down

0 comments on commit ef1bb66

Please sign in to comment.