Skip to content

Commit

Permalink
add command line scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
kjappelbaum committed Aug 14, 2024
1 parent 9f1ec22 commit 0604d7c
Show file tree
Hide file tree
Showing 12 changed files with 56 additions and 48 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,5 @@ scratch/
*.swp

.DS_Store
sampled_data/
sampled_benchmark/
34 changes: 7 additions & 27 deletions data/tabular/bicerano_dataset/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,6 @@
bibtex:
- "@article{afzal2021,
author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander
and Halls, Mathew D. and Gavartin, Jacob L. and Morisato,
Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.},
title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical
Properties of Polymers for Various Applications},
journal = {ACS Applied Polymer Materials},
volume = {3},
number = {2},
pages = {620-630},
year = {2021},
doi = {10.1021/acsapm.0c00524}}"
description:
"This paper outlines a MD simulation workflow based on GPU MD simulation
and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
- "@article{afzal2021, author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander and Halls, Mathew D. and Gavartin, Jacob L. and Morisato, Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.}, title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical Properties of Polymers for Various Applications}, journal = {ACS Applied Polymer Materials}, volume = {3}, number = {2}, pages = {620-630}, year = {2021}, doi = {10.1021/acsapm.0c00524}}"
description: "This paper outlines a MD simulation workflow based on GPU MD simulation and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
identifiers:
- description: PSMILES
id: PSMILES
Expand Down Expand Up @@ -81,8 +68,7 @@ templates:
Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
Options:
{Tg_exp#}
{Tg_exp%}
Answer:<EOI>{%multiple_choice_result}
- |-
Expand All @@ -94,8 +80,7 @@ templates:
Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
Options:
{Tg_calc#}
{Tg_calc%}
Answer:<EOI>{%multiple_choice_result}
- |-
Expand All @@ -106,10 +91,7 @@ templates:
Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
Options:
{%multiple_choice_enum%3%aA1}
{rho_300K_calc#}
{rho_300K_calc%}
Answer:<EOI>{%multiple_choice_result}
- |-
Expand All @@ -120,8 +102,7 @@ templates:
Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
Options:
{Tg_exp#}
{Tg_exp%}
Answer:<EOI>{%multiple_choice_result}
- |-
Expand All @@ -133,7 +114,6 @@ templates:
Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
Options:
{Tg_calc#}
{Tg_calc%}
Answer:<EOI>{%multiple_choice_result}
2 changes: 2 additions & 0 deletions data/tabular/bicerano_dataset/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def transform_data():

clean_data.columns = clean_columns

clean_data["compound_name"] = clean_data["compound_name"].str.strip()

clean_data["PSMILES"] = clean_data["PSMILES"].str.replace(
"[Ce]", "[*]", regex=False
)
Expand Down
2 changes: 2 additions & 0 deletions data/text_sampling/text_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,10 +664,12 @@ def get_sample_dict(self, sample: pd.Series, template: str):
multiple_choice_enum_idx = multiple_choice_enum_idx[0] # unpack list
multiple_choice_enum = input_variables[multiple_choice_enum_idx]


# get multiple_choice_var
multiple_choice_var_idx = [
i for i, x in enumerate(input_variables) if x.endswith("%")
]

assert len(multiple_choice_var_idx) == 1
multiple_choice_var_idx = multiple_choice_var_idx[0] # unpack list
multiple_choice_input = input_variables[multiple_choice_var_idx]
Expand Down
4 changes: 2 additions & 2 deletions data/train_test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,8 @@ def remaining_split(
file
for file in yaml_files
if not (
yaml_file_has_column_of_type(file, "SMILES")
or yaml_file_has_column_of_type(file, "AS_SEQUENCE")
yaml_file_has_column_of_type(file, "SMILES")
or yaml_file_has_column_of_type(file, "AS_SEQUENCE")
)
]

Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ dataset_creation = [
"pandarallel",
]


# Console commands installed with the package; each value is "module:function".
[project.scripts]
chemnlp-generate-meta = "chemnlp.data.meta_yaml_generator:cli"
# The module file is src/chemnlp/data/meta_yaml_augmentor.py (spelled "augmentor").
chemnlp-augment-meta = "chemnlp.data.meta_yaml_augmentor:cli"
chemnlp-sample = "chemnlp.data.sampler_cli:cli"
chemnlp-add-random-split-column = "chemnlp.data.utils:add_random_split_column_cli"

[tool.setuptools_scm]
version_scheme = "post-release"
6 changes: 4 additions & 2 deletions src/chemnlp/data/meta_yaml_augmentor.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def generate_augmented_meta_yaml(
return meta_yaml


def cli(data_dir: str, model: str = "gpt-4o", override: bool = False):
def _cli(data_dir: str, model: str = "gpt-4o", override: bool = False):
"""
Generate augmented meta.yaml for the given dataset.
Expand All @@ -152,6 +152,8 @@ def cli(data_dir: str, model: str = "gpt-4o", override: bool = False):

return augmented_meta_yaml

def cli():
    """Zero-argument entry point that hands ``_cli`` to ``fire.Fire``.

    Takes no parameters itself so it can be referenced as a console-script
    target; the command line arguments are handled by ``fire`` on behalf of
    ``_cli``.
    """
    fire.Fire(_cli)

if __name__ == "__main__":
fire.Fire(cli)
fire.Fire(_cli)
7 changes: 5 additions & 2 deletions src/chemnlp/data/meta_yaml_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def generate_meta_yaml(
return meta_yaml


def cli(
def _cli(
data_path: str,
dataset_name: str,
description: str,
Expand Down Expand Up @@ -195,6 +195,9 @@ def cli(
print("Failed to generate meta.yaml")


def cli():
    """Zero-argument entry point that hands ``_cli`` to ``fire.Fire``.

    Takes no parameters itself so it can be referenced as a console-script
    target; the command line arguments are handled by ``fire`` on behalf of
    ``_cli``.
    """
    fire.Fire(_cli)

# Example usage
if __name__ == "__main__":
fire.Fire(cli)
fire.Fire(_cli)
22 changes: 12 additions & 10 deletions src/chemnlp/data/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def _wrap_identifier(self, identifier: str, value: str) -> str:
except ValueError:
identifier_type = None

if identifier_type:
if identifier_type and identifier_type not in self.config.get('excluded_from_wrapping', []):
return f"[BEGIN_{identifier_type}]{value}[END_{identifier_type}]"
return value

Expand Down Expand Up @@ -806,22 +806,24 @@ def export(self, output_dir: str, template: str) -> pd.DataFrame:
# and we will export it as a single file
# otherwise, we will export the data based on the split
if "split" not in self.df.columns:
self.df['split'] = 'train'
logger.warning("No split column found in the data. Exporting as a single file.")
self.df["split"] = "train"
logger.warning(
"No split column found in the data. Exporting as a single file."
)
for split in self.df["split"].unique():
df_split = self.df[self.df["split"] == split]
samples = [self.sample(row, template) for _, row in df_split.iterrows()]

df_out = pd.DataFrame(samples)

if self.benchmarking_templates:
columns_to_keep = ["input", "output"]
if self.multiple_choice_benchmarking_templates:
columns_to_keep.extend(["answer_choices", "correct_output_index"])
else:
columns_to_keep = ["text"]
# if self.benchmarking_templates:
# columns_to_keep = ["input", "output", "text"]
# if self.multiple_choice_benchmarking_templates:
# columns_to_keep.extend(["answer_choices", "correct_output_index"])
# else:
# columns_to_keep = ["text"]

df_out = df_out[columns_to_keep]
# df_out = df_out[columns_to_keep]

output_path = os.path.join(output_dir, f"{split}.jsonl")
with open(output_path, "w") as f:
Expand Down
3 changes: 3 additions & 0 deletions src/chemnlp/data/sampler_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def process_dataset(
"multiple_choice_benchmarking_format": None,
"wrap_identifiers": wrap_identifiers,
"benchmarking_templates": benchmarking,
"excluded_from_wrapping": ['Other']
}

templates = meta["templates"]
Expand Down Expand Up @@ -171,6 +172,8 @@ def main(
wrap_identifiers,
)

def cli():
    """Zero-argument entry point that hands ``main`` to ``fire.Fire``.

    Takes no parameters itself so it can be referenced as a console-script
    target; the command line arguments are handled by ``fire`` on behalf of
    ``main``.
    """
    fire.Fire(main)

if __name__ == "__main__":
fire.Fire(main)
17 changes: 14 additions & 3 deletions src/chemnlp/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import yaml
from typing import Any


import fire
import numpy as np
import pandas as pd


def add_random_split_column(df):
# Calculate the number of rows for each split
Expand All @@ -14,16 +16,25 @@ def add_random_split_column(df):
n_valid = n_rows - n_train - n_test

# Create the split column
split = ['train'] * n_train + ['test'] * n_test + ['valid'] * n_valid
split = ["train"] * n_train + ["test"] * n_test + ["valid"] * n_valid

# Shuffle the split column
np.random.shuffle(split)

# Add the split column to the dataframe
df['split'] = split
df["split"] = split

return df

def _add_random_split_column(file):
    """Add a random ``split`` column to the CSV at ``file``, rewriting it in place.

    Args:
        file: Path of the CSV file to read and then overwrite.
    """
    # Read, annotate and write back in one pass; the row index is not written.
    add_random_split_column(pd.read_csv(file)).to_csv(file, index=False)

def add_random_split_column_cli(file: str = None):
    """Command line entry point that adds a random ``split`` column to a CSV.

    Console-script entry points are invoked without arguments, so ``file``
    must be optional; previously it was required (and then ignored), which
    made the installed command fail before any argument parsing happened.

    Args:
        file: Optional path of the CSV file to update in place. When given
            (programmatic use) the file is processed directly. When omitted,
            the path is taken from the command line via ``fire``.
    """
    if file is not None:
        # Explicit path supplied by a Python caller: no argv parsing needed.
        _add_random_split_column(file)
        return

    # Invoked as a console script: let fire map argv onto the helper.
    fire.Fire(_add_random_split_column)


def oxford_comma_join(items: List[str]) -> str:
"""Join a list of items with Oxford comma"""
if len(items) == 1:
Expand Down
2 changes: 2 additions & 0 deletions tests/data/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def sample_config():
"multiple_choice_rnd_symbols": ["", ".)", ")"],
"multiple_choice_benchmarking_templates": False,
"multiple_choice_benchmarking_format": None,
"excluded_from_wrapping": ['Other']
}


Expand All @@ -122,6 +123,7 @@ def sample_config_with_wrapping():
"multiple_choice_benchmarking_templates": False,
"multiple_choice_benchmarking_format": None,
"wrap_identifiers": True,
"excluded_from_wrapping": ['Other']
}


Expand Down

0 comments on commit 0604d7c

Please sign in to comment.