add command line scripts

OpenBioML · Aug 14, 2024 · 0604d7c · 0604d7c
1 parent 9f1ec22
commit 0604d7c
Show file tree

Hide file tree

Showing 12 changed files with 56 additions and 48 deletions.
diff --git a/.gitignore b/.gitignore
@@ -146,3 +146,5 @@ scratch/
 *.swp
 
 .DS_Store
+sampled_data/
+sampled_benchmark/
diff --git a/data/tabular/bicerano_dataset/meta.yaml b/data/tabular/bicerano_dataset/meta.yaml
@@ -1,19 +1,6 @@
 bibtex:
-  - "@article{afzal2021,
-    author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander
-    and Halls, Mathew D. and Gavartin, Jacob L. and Morisato,
-    Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.},
-    title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical
-    Properties of Polymers for Various Applications},
-    journal = {ACS Applied Polymer Materials},
-    volume = {3},
-    number = {2},
-    pages = {620-630},
-    year = {2021},
-    doi = {10.1021/acsapm.0c00524}}"
-description:
-  "This paper outlines a MD simulation workflow based on GPU MD simulation
-  and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
+  - "@article{afzal2021, author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander and Halls, Mathew D. and Gavartin, Jacob L. and Morisato, Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.}, title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical Properties of Polymers for Various Applications}, journal = {ACS Applied Polymer Materials}, volume = {3}, number = {2}, pages = {620-630}, year = {2021}, doi = {10.1021/acsapm.0c00524}}"
+description: "This paper outlines a MD simulation workflow based on GPU MD simulation and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
 identifiers:
   - description: PSMILES
     id: PSMILES
@@ -81,8 +68,7 @@ templates:
     Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
 
     Options:
-
-    {Tg_exp#}
+    {Tg_exp%}
 
     Answer:<EOI>{%multiple_choice_result}
   - |-
@@ -94,8 +80,7 @@ templates:
     Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
 
     Options:
-
-    {Tg_calc#}
+    {Tg_calc%}
 
     Answer:<EOI>{%multiple_choice_result}
   - |-
@@ -106,10 +91,7 @@ templates:
     Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
 
     Options:
-
-    {%multiple_choice_enum%3%aA1}
-
-    {rho_300K_calc#}
+    {rho_300K_calc%}
 
     Answer:<EOI>{%multiple_choice_result}
   - |-
@@ -120,8 +102,7 @@ templates:
     Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
 
     Options:
-
-    {Tg_exp#}
+    {Tg_exp%}
 
     Answer:<EOI>{%multiple_choice_result}
   - |-
@@ -133,7 +114,6 @@ templates:
     Constraint: You must pick one of {%multiple_choice_enum%3%aA1}.
 
     Options:
-
-    {Tg_calc#}
+    {Tg_calc%}
 
     Answer:<EOI>{%multiple_choice_result}
diff --git a/data/tabular/bicerano_dataset/transform.py b/data/tabular/bicerano_dataset/transform.py
@@ -31,6 +31,8 @@ def transform_data():
 
     clean_data.columns = clean_columns
 
+    clean_data["compound_name"] = clean_data["compound_name"].str.strip()
+
     clean_data["PSMILES"] = clean_data["PSMILES"].str.replace(
         "[Ce]", "[*]", regex=False
     )

diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py
@@ -664,10 +664,12 @@ def get_sample_dict(self, sample: pd.Series, template: str):
             multiple_choice_enum_idx = multiple_choice_enum_idx[0]  # unpack list
             multiple_choice_enum = input_variables[multiple_choice_enum_idx]
 
+
             # get multiple_choice_var
             multiple_choice_var_idx = [
                 i for i, x in enumerate(input_variables) if x.endswith("%")
             ]
+
             assert len(multiple_choice_var_idx) == 1
             multiple_choice_var_idx = multiple_choice_var_idx[0]  # unpack list
             multiple_choice_input = input_variables[multiple_choice_var_idx]

diff --git a/data/train_test_split.py b/data/train_test_split.py
@@ -300,8 +300,8 @@ def remaining_split(
         file
         for file in yaml_files
         if not (
-           yaml_file_has_column_of_type(file, "SMILES")
-           or yaml_file_has_column_of_type(file, "AS_SEQUENCE")
+            yaml_file_has_column_of_type(file, "SMILES")
+            or yaml_file_has_column_of_type(file, "AS_SEQUENCE")
         )
     ]
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,12 +30,11 @@ dataset_creation = [
     "pandarallel",
 ]
 
-
 [project.scripts]
 chemnlp-generate-meta = "chemnlp.data.meta_yaml_generator:cli"
-chemnlp-augment-meta = "chemnlp.data.meta_yaml_augmenter:cli"
+chemnlp-augment-meta = "chemnlp.data.meta_yaml_augmentor:cli"
 chemnlp-sample = "chemnlp.data.sampler_cli:cli"
-
+chemnlp-add-random-split-column = "chemnlp.data.utils:add_random_split_column_cli"
 
 [tool.setuptools_scm]
 version_scheme = "post-release"
diff --git a/src/chemnlp/data/meta_yaml_augmentor.py b/src/chemnlp/data/meta_yaml_augmentor.py
@@ -128,7 +128,7 @@ def generate_augmented_meta_yaml(
     return meta_yaml
 
 
-def cli(data_dir: str, model: str = "gpt-4o", override: bool = False):
+def _cli(data_dir: str, model: str = "gpt-4o", override: bool = False):
     """
     Generate augmented meta.yaml for the given dataset.
 
@@ -152,6 +152,8 @@ def cli(data_dir: str, model: str = "gpt-4o", override: bool = False):
 
     return augmented_meta_yaml
 
+def cli():
+    """Console-script entry point: expose ``_cli`` on the command line via Fire."""
+    fire.Fire(_cli)
 
 if __name__ == "__main__":
-    fire.Fire(cli)
+    fire.Fire(_cli)
diff --git a/src/chemnlp/data/meta_yaml_generator.py b/src/chemnlp/data/meta_yaml_generator.py
@@ -159,7 +159,7 @@ def generate_meta_yaml(
     return meta_yaml
 
 
-def cli(
+def _cli(
     data_path: str,
     dataset_name: str,
     description: str,
@@ -195,6 +195,9 @@ def cli(
         print("Failed to generate meta.yaml")
 
 
+def cli():
+    """Console-script entry point: expose ``_cli`` on the command line via Fire."""
+    fire.Fire(_cli)
+
 # Example usage
 if __name__ == "__main__":
-    fire.Fire(cli)
+    fire.Fire(_cli)
diff --git a/src/chemnlp/data/sampler.py b/src/chemnlp/data/sampler.py
@@ -187,7 +187,7 @@ def _wrap_identifier(self, identifier: str, value: str) -> str:
         except ValueError:
             identifier_type = None
 
-        if identifier_type:
+        if identifier_type and identifier_type not in self.config.get('excluded_from_wrapping', []):
             return f"[BEGIN_{identifier_type}]{value}[END_{identifier_type}]"
         return value
 
@@ -806,22 +806,24 @@ def export(self, output_dir: str, template: str) -> pd.DataFrame:
         # and we will export it as a single file
         # otherwise, we will export the data based on the split
         if "split" not in self.df.columns:
-            self.df['split'] = 'train'
-            logger.warning("No split column found in the data. Exporting as a single file.")
+            self.df["split"] = "train"
+            logger.warning(
+                "No split column found in the data. Exporting as a single file."
+            )
         for split in self.df["split"].unique():
             df_split = self.df[self.df["split"] == split]
             samples = [self.sample(row, template) for _, row in df_split.iterrows()]
 
             df_out = pd.DataFrame(samples)
 
-            if self.benchmarking_templates:
-                columns_to_keep = ["input", "output"]
-                if self.multiple_choice_benchmarking_templates:
-                    columns_to_keep.extend(["answer_choices", "correct_output_index"])
-            else:
-                columns_to_keep = ["text"]
+            # if self.benchmarking_templates:
+            #     columns_to_keep = ["input", "output", "text"]
+            #     if self.multiple_choice_benchmarking_templates:
+            #         columns_to_keep.extend(["answer_choices", "correct_output_index"])
+            # else:
+            #     columns_to_keep = ["text"]
 
-            df_out = df_out[columns_to_keep]
+            # df_out = df_out[columns_to_keep]
 
             output_path = os.path.join(output_dir, f"{split}.jsonl")
             with open(output_path, "w") as f:

diff --git a/src/chemnlp/data/sampler_cli.py b/src/chemnlp/data/sampler_cli.py
@@ -93,6 +93,7 @@ def process_dataset(
         "multiple_choice_benchmarking_format": None,
         "wrap_identifiers": wrap_identifiers,
         "benchmarking_templates": benchmarking,
+        "excluded_from_wrapping": ['Other']
     }
 
     templates = meta["templates"]
@@ -171,6 +172,8 @@ def main(
         wrap_identifiers,
     )
 
+def cli():
+    """Console-script entry point: expose ``main`` on the command line via Fire."""
+    fire.Fire(main)
 
 if __name__ == "__main__":
     fire.Fire(main)
diff --git a/src/chemnlp/data/utils.py b/src/chemnlp/data/utils.py
@@ -3,8 +3,10 @@
 import yaml
 from typing import Any
 
-
+import fire
 import numpy as np
+import pandas as pd
+
 
 def add_random_split_column(df):
     # Calculate the number of rows for each split
@@ -14,16 +16,25 @@ def add_random_split_column(df):
     n_valid = n_rows - n_train - n_test
 
     # Create the split column
-    split = ['train'] * n_train + ['test'] * n_test + ['valid'] * n_valid
+    split = ["train"] * n_train + ["test"] * n_test + ["valid"] * n_valid
 
     # Shuffle the split column
     np.random.shuffle(split)
 
     # Add the split column to the dataframe
-    df['split'] = split
+    df["split"] = split
 
     return df
 
+def _add_random_split_column(file):
+    """Load the CSV at ``file``, add a shuffled ``split`` column, and overwrite it."""
+    frame = add_random_split_column(pd.read_csv(file))
+    frame.to_csv(file, index=False)
+
+def add_random_split_column_cli(file: "str | None" = None):
+    """Console-script entry point for adding a random split column to a CSV.
+
+    Packaging entry points are invoked without arguments, so ``file`` must be
+    optional; in that case Fire reads the path from the command line.
+
+    Args:
+        file: Optional path of the CSV file to update in place. When given,
+            the file is processed directly and the command line is not parsed.
+    """
+    if file is not None:
+        _add_random_split_column(file)
+        return
+    fire.Fire(_add_random_split_column)
+
+
 def oxford_comma_join(items: List[str]) -> str:
     """Join a list of items with Oxford comma"""
     if len(items) == 1:

diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py
@@ -111,6 +111,7 @@ def sample_config():
         "multiple_choice_rnd_symbols": ["", ".)", ")"],
         "multiple_choice_benchmarking_templates": False,
         "multiple_choice_benchmarking_format": None,
+                "excluded_from_wrapping": ['Other']
     }
 
 
@@ -122,6 +123,7 @@ def sample_config_with_wrapping():
         "multiple_choice_benchmarking_templates": False,
         "multiple_choice_benchmarking_format": None,
         "wrap_identifiers": True,
+                "excluded_from_wrapping": ['Other']
     }