-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Kevin Maik Jablonka <[email protected]> Co-authored-by: Michael Pieler <[email protected]> Co-authored-by: Kevin M Jablonka <[email protected]> Co-authored-by: Michael Pieler <[email protected]>
- Loading branch information
1 parent
698f2ea
commit ef1bb66
Showing
9 changed files
with
324 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
--- | ||
name: uniprot_binding_sites | ||
description: |- | ||
Binding sites of a molecule in protein sequences. | ||
targets: | ||
- id: start_binding_site | ||
description: index for start of the binding sites of a protein | ||
type: text | ||
names: | ||
- noun: start binding site | ||
- id: end_binding_site | ||
description: index for end of the binding sites of a protein | ||
type: text | ||
names: | ||
- noun: end binding site | ||
- id: SMILES | ||
description: SMILES | ||
type: SMILES | ||
names: | ||
- noun: SMILES | ||
identifiers: | ||
- id: sequence | ||
type: AS_SEQUENCE | ||
description: other | ||
license: MIT | ||
links: | ||
- url: https://www.uniprot.org/ | ||
description: data source | ||
num_points: 780449 | ||
bibtex: | ||
- |- | ||
@article{10.1093/nar/gkac1052, | ||
author = {The UniProt Consortium}, | ||
title = {UniProt - the Universal Protein Knowledgebase in 2023}, | ||
journal = {Nucleic Acids Research}, | ||
volume = {51}, | ||
number = {D1}, | ||
pages = {D523-D531}, | ||
year = {2022}, | ||
month = {11}, | ||
issn = {0305-1048}, | ||
doi = {10.1093/nar/gkac1052}, | ||
url = {https://doi.org/10.1093/nar/gkac1052}} | ||
templates: | ||
- |- | ||
Question: What are the binding sites of the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} in this {#AA|amino acid!} sequence {sequence#}? | ||
Answer: The binding site for the {#molecule|chemical|compound!} with the SMILES {SMILES#} in the given {#AA|amino acid!} sequence is: {start_binding_site#}-{end_binding_site#}. | ||
- |- | ||
Question: What molecule can bind in the binding site {start_binding_site#}-{end_binding_site#} in the amino acid sequence below? | ||
{#AA|amino acid!} sequence: {sequence#}. | ||
Answer: {SMILES#} | ||
- |- | ||
Task: Design a binding site in the {#AA|amino acid!} sequence {sequence#}, in which the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#} can bind. | ||
Answer: {start_binding_site#}-{end_binding_site#} | ||
- |- | ||
Task: Design a {#molecule|chemical|compound!} that binds to a given site in the {#AA|amino acid!} sequence {sequence#}. | ||
Description: The binding site is {start_binding_site#}-{end_binding_site#}. | ||
Answer: {SMILES#} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import pandas as pd | ||
from huggingface_hub import hf_hub_download | ||
|
||
DATA = "uniprot_binding_sites" | ||
|
||
|
||
def load_dataset() -> pd.DataFrame:
    """Fetch the cleaned UniProt binding-site table and store a local copy.

    Downloads ``<DATA>/data_clean.csv`` from the ``chemnlp/uniprot`` dataset
    repository via ``hf_hub_download``, casts the ``end_binding_site`` column
    to ``int``, removes duplicate rows and writes the result to
    ``data_clean.csv`` in the current working directory.

    Returns:
        The de-duplicated DataFrame that was written to disk.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )
    frame = pd.read_csv(csv_path)
    # Only the end index is cast here; the start index is left as parsed.
    frame.end_binding_site = frame.end_binding_site.astype(int)
    frame = frame.drop_duplicates()
    print(f"Successfully loaded {DATA}! {len(frame)} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame
|
||
|
||
if __name__ == "__main__": | ||
load_dataset() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
--- | ||
name: uniprot_organisms | ||
description: |- | ||
Organisms in which an amino-acid sequence can be found. | ||
targets: | ||
- id: organisms | ||
description: organisms in which a protein can be found | ||
type: text | ||
names: | ||
- noun: organisms | ||
identifiers: | ||
- id: other | ||
type: AS_SEQUENCE | ||
description: other | ||
license: MIT | ||
links: | ||
- url: https://www.uniprot.org/ | ||
description: data source | ||
num_points: 559428 | ||
bibtex: | ||
- |- | ||
@article{10.1093/nar/gkac1052, | ||
author = {The UniProt Consortium}, | ||
title = {UniProt - the Universal Protein Knowledgebase in 2023}, | ||
journal = {Nucleic Acids Research}, | ||
volume = {51}, | ||
number = {D1}, | ||
pages = {D523-D531}, | ||
year = {2022}, | ||
month = {11}, | ||
issn = {0305-1048}, | ||
doi = {10.1093/nar/gkac1052}, | ||
url = {https://doi.org/10.1093/nar/gkac1052}} | ||
templates: | ||
- |- | ||
The protein with the {#amino acid sequence|AA sequence!} {other#} can be found in {#the organism |!}{organisms#}. | ||
- |- | ||
Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. | ||
{#Amino acid sequence|Sequence|AA sequence!}: {other#} | ||
Result: {organisms#} | ||
- |- | ||
User: In what organism can you find the following {#protein|amino acid sequence|AA sequence|polypeptide!}: {other#}? | ||
Assistant: The given {#polypeptide|protein|amino acid sequence|AA sequence!} can be found in {organisms#}. | ||
- |- | ||
Task: {#Predict|Identify!} the organism in which this {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. | ||
{#Amino acid sequence|Sequence|AA sequence!}: {other#} | ||
Result:<EOI> {organisms#} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import pandas as pd | ||
from huggingface_hub import hf_hub_download | ||
|
||
DATA = "uniprot_organisms" | ||
|
||
|
||
def load_dataset() -> pd.DataFrame:
    """Fetch the cleaned UniProt organisms table and store a local copy.

    Downloads ``<DATA>/data_clean.csv`` from the ``chemnlp/uniprot`` dataset
    repository via ``hf_hub_download``, renames the ``sequence`` column to
    ``other``, removes duplicate rows and writes the result to
    ``data_clean.csv`` in the current working directory.

    Returns:
        The de-duplicated DataFrame that was written to disk.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )
    frame = (
        pd.read_csv(csv_path)
        .rename(columns={"sequence": "other"})
        .drop_duplicates()
    )
    print(f"Successfully loaded {DATA}! {len(frame)} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame
|
||
|
||
if __name__ == "__main__": | ||
load_dataset() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
--- | ||
name: uniprot_reactions | ||
description: |- | ||
Protein sequences and the reactions these can catalyze. | ||
targets: | ||
- id: reactions | ||
description: biochemical reactions catalyzed by a protein | ||
type: text | ||
names: | ||
- noun: chemical reactions | ||
- noun: biochemical reactions | ||
identifiers: | ||
- id: other | ||
type: AS_SEQUENCE | ||
description: other | ||
license: MIT | ||
links: | ||
- url: https://www.uniprot.org/ | ||
description: data source | ||
num_points: 253713 | ||
bibtex: | ||
- |- | ||
@article{10.1093/nar/gkac1052, | ||
author = {The UniProt Consortium}, | ||
title = {UniProt - the Universal Protein Knowledgebase in 2023}, | ||
journal = {Nucleic Acids Research}, | ||
volume = {51}, | ||
number = {D1}, | ||
pages = {D523-D531}, | ||
year = {2022}, | ||
month = {11}, | ||
issn = {0305-1048}, | ||
doi = {10.1093/nar/gkac1052}, | ||
url = {https://doi.org/10.1093/nar/gkac1052}} | ||
templates: | ||
- |- | ||
The {#protein|amino acid sequence|AA sequence|polypeptide!} {#with the sequence |!}{other#} catalyzes the {#following |!}{#chemical |biochemical |!}reaction: {reactions#} | ||
- |- | ||
Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}. | ||
{#Amino acid sequence|Sequence|AA sequence!}: {other#} | ||
Result: {reactions#} | ||
- |- | ||
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction. | ||
Reaction: {reactions#} | ||
{#Output|Result!}: {other#} | ||
- |- | ||
User: Can you {#tell me|come up with!} a {#biochemical |chemical |!}reaction that can be catalyzed by the following {#protein|amino acid sequence|AA sequence|polypeptide!}:\n{other#} | ||
Assistant: {#Yes, the|Sure, the|Yes, sure, the|The!} {#chemical |biochemical |!}reaction that can be catalyzed by the given {#protein|amino acid sequence|AA sequence|polypeptide!} is:\n{reactions#} | ||
- |- | ||
Task: {#Predict|Identify!} a {#biochemical |chemical |!}reaction that can be catalyzed by {#this|the following!} {#protein|amino acid sequence|AA sequence|polypeptide!}. | ||
{#Amino acid sequence|Sequence|AA sequence!}: {other#} | ||
Result:<EOI> {reactions#} | ||
- |- | ||
Task: {#Generate|Create|Come up with|Design!} a {#protein|amino acid sequence|AA sequence|polypeptide!} that can catalyze {#a|this!} specific {#biochemical |chemical |!}reaction. | ||
Reaction: {reactions#} | ||
{#Output|Result!}:<EOI> {other#} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import pandas as pd | ||
from huggingface_hub import hf_hub_download | ||
|
||
DATA = "uniprot_reactions" | ||
|
||
|
||
def load_dataset() -> pd.DataFrame:
    """Fetch the cleaned UniProt reactions table and store a local copy.

    Downloads ``<DATA>/data_clean.csv`` from the ``chemnlp/uniprot`` dataset
    repository via ``hf_hub_download``, renames the ``sequence`` column to
    ``other``, removes duplicate rows and writes the result to
    ``data_clean.csv`` in the current working directory.

    Returns:
        The de-duplicated DataFrame that was written to disk.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )
    frame = (
        pd.read_csv(csv_path)
        .rename(columns={"sequence": "other"})
        .drop_duplicates()
    )
    print(f"Successfully loaded {DATA}! {len(frame)} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame
|
||
|
||
if __name__ == "__main__": | ||
load_dataset() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
--- | ||
name: uniprot_sentences | ||
description: |- | ||
Descriptions of the function of a protein. | ||
targets: | ||
- id: sentences | ||
description: sentences describing the function of a protein | ||
type: text | ||
names: | ||
- noun: function | ||
identifiers: | ||
- id: sequence | ||
type: AS_SEQUENCE | ||
description: other | ||
license: MIT | ||
links: | ||
- url: https://www.uniprot.org/ | ||
description: data source | ||
num_points: 396241 | ||
bibtex: | ||
- |- | ||
@article{10.1093/nar/gkac1052, | ||
author = {The UniProt Consortium}, | ||
title = {UniProt - the Universal Protein Knowledgebase in 2023}, | ||
journal = {Nucleic Acids Research}, | ||
volume = {51}, | ||
number = {D1}, | ||
pages = {D523-D531}, | ||
year = {2022}, | ||
month = {11}, | ||
issn = {0305-1048}, | ||
doi = {10.1093/nar/gkac1052}, | ||
url = {https://doi.org/10.1093/nar/gkac1052}} | ||
templates: | ||
- |- | ||
User: {#Please describe|Describe|Please briefly describe|Briefly describe!} the {#biological |biochemical |!}function of {#the|this!} {#protein|amino acid sequence|AA sequence|polypeptide!}: {sequence#} | ||
Assistant: {sentences#}. | ||
- |- | ||
User: What {#protein|amino acid sequence|AA sequence|polypeptide!} fits the {#biological |biochemical |!}description {#in the next sentence(s) |below |!}best?\n{sentences#} | ||
Assistant: A {#protein|amino acid sequence|AA sequence|polypeptide!} that fits the {#description|sentences!} is:\n{sequence#} | ||
- |- | ||
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description. | ||
Description: {sentences#} | ||
{#Output|Result!}: {sequence#} | ||
- |- | ||
Task: {#Generate|Create|Come up with!} a {#protein|amino acid sequence|AA sequence|polypeptide!} based on the description. | ||
Description: {sentences#} | ||
{#Output|Result!}:<EOI> {sequence#} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import pandas as pd | ||
import regex as re | ||
from huggingface_hub import hf_hub_download | ||
|
||
DATA = "uniprot_sentences" | ||
|
||
|
||
def clean_up_sentences(text: str) -> str:
    """Remove "(By similarity)" annotations from a functional description.

    The annotation, written either as "(By similarity)" or
    "(By. similarity)", is dropped together with the whitespace around it.
    When it sits between two pieces of running text, a single space is kept
    so the neighbouring words are not fused together. Afterwards, stray
    spaces in front of full stops (" ." / " . ") are tightened.

    Args:
        text: Sentence(s) describing the function of a protein.

    Returns:
        The text without the similarity annotations.
    """

    def _gap(match) -> str:
        # The pattern swallows the whitespace on both sides, so decide here
        # whether a separating space has to be put back.
        before = match.string[: match.start()]
        after = match.string[match.end():]
        if not before or not after:
            # Annotation at the very start or end: nothing to separate.
            return ""
        if after[0] in ".,;:!?)":
            # Closing punctuation attaches directly to the preceding word.
            return ""
        return " "

    updated_text = re.sub(r"\s*\((?:By\.? similarity)\)\s*", _gap, text)
    updated_text = updated_text.replace(" . ", ". ")
    updated_text = updated_text.replace(" .", ".")
    return updated_text
|
||
|
||
def load_dataset() -> pd.DataFrame:
    """Fetch the cleaned UniProt function-description table and store a copy.

    Downloads ``<DATA>/data_clean.csv`` from the ``chemnlp/uniprot`` dataset
    repository via ``hf_hub_download``, runs ``clean_up_sentences`` over the
    ``sentences`` column, removes duplicate rows and writes the result to
    ``data_clean.csv`` in the current working directory.

    Returns:
        The cleaned, de-duplicated DataFrame that was written to disk.
    """
    csv_path = hf_hub_download(
        repo_type="dataset",
        repo_id="chemnlp/uniprot",
        filename=f"{DATA}/data_clean.csv",
    )

    frame = pd.read_csv(csv_path)
    # Clean first, then de-duplicate, so rows that only differed by the
    # removed annotation collapse into one.
    frame.sentences = frame.sentences.apply(clean_up_sentences)
    frame = frame.drop_duplicates()
    print(f"Successfully loaded {DATA}! {len(frame)} rows")
    frame.to_csv("data_clean.csv", index=False)
    print(f"Successfully loaded {DATA}!")
    return frame
|
||
|
||
if __name__ == "__main__": | ||
load_dataset() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters