Commit 4a1323b

Data processing bug (#4)

* added peft req
* added a dev-requirements file and fixed formatting
* added functionality to reset the dataloader after an epoch finishes
* Mistral training now works (the padding token was incorrectly set)
* added config ignore
* reverting gitignore
* fixed two bugs: labels for BOS/EOS tokens weren't added properly if there was a sequence separator, and the attention mask held the BOS/EOS token id instead of a 1 for those tokens specifically
* added a gotcha for the separator in data preprocessing
* fixed a small bug in truncation where the EOS token should be added after truncating

1 parent 8d62dd3 commit 4a1323b

File tree: 4 files changed, +65 −24 lines changed

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 *.out
 *.err
 *.egg-info
+**/*.sh
 __pycache__/
 wandb/
 build/

docs/config.md
Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ Similar to the wandb config above, these keyword parameters are fed directly int
 * `overlap`: When we chunk a data point during packing, we can choose to have some overlap between the current chunk and the next chunk. This might help the model understand surrounding context during training (although this isn't something we have empirically investigated, we keep this option available to users).
 * `add_bos_eos_tokens`: Whether to add `BOS` and `EOS` tokens as defined by the respective HuggingFace tokenizer. If using packing, these will be added after packing is done, so that each chunk of size `max_seq_len` has these tokens.
 * `from_disk`: Whether we are going to be loading the dataset to preprocess from disk (the other option is to download straight from HuggingFace).
-* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`).
+* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`). **Note:** if `separator` is not found in a given sequence, the default behavior is that the datapoint will be skipped and will not be part of the final set.
 * `load_path`: The directory containing the HuggingFace dataset we are loading to preprocess.
 * `split`: If `load_path` is a dataset dictionary, `split` specifies which key in this dictionary contains the dataset we are preprocessing.
 * `save_path`: The directory we will be saving the processed dataset to.
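
To make the `seperator` semantics concrete, here is a minimal sketch of conditional finetuning label masking. It is not code from this repo: the separator string, the choice of `gpt2` tokenizer, and `ignore_index = -100` (the label value PyTorch's cross-entropy loss skips) are all assumptions for illustration.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical tokenizer choice
ignore_index = -100          # label value skipped by the loss
separator = "### Response:"  # hypothetical separator string

text = "### Instruction: say hi ### Response: hi there"
cut = text.find(separator) + len(separator)
prefix, postfix = text[:cut], text[cut:]

prefix_ids = tokenizer.encode(prefix, add_special_tokens=False)
postfix_ids = tokenizer.encode(postfix, add_special_tokens=False)

input_ids = prefix_ids + postfix_ids
# Everything up to and including the separator is masked out of the loss;
# only the response tokens contribute gradients.
labels = [ignore_index] * len(prefix_ids) + postfix_ids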

preprocess_data.py
Lines changed: 62 additions & 22 deletions

@@ -93,21 +93,28 @@ def tokenize_dataset(
     all_input_ids = []
     all_attention_mask = []
     all_labels = []
+    # Adding bos/eos
     if add_bos_eos:
         bos, eos = tokenizer.bos_token, tokenizer.eos_token
     else:
         bos, eos = "", ""
     for example in examples[data_field]:
+        # If we want to include a prepended prompt to each datapoint
         if pre_pend:
             prompt = f"{bos}{pre_pend}{example}{eos}"
         else:
             prompt = f"{bos}{example}{eos}"
+        # If we've specified a separator present in each sequence
         if not separator:
             tokenized = tokenizer.encode(prompt, add_special_tokens=False)
-            if truncate and len(tokenized) > tokenizer.max_model_length:
-                tokenized = tokenized[:tokenizer.max_model_length]
+            if truncate and len(tokenized) > tokenizer.model_max_length:
+                tokenized = tokenized[:tokenizer.model_max_length - 1]
+                tokenized.append(tokenizer.eos_token_id)
             all_labels.append(deepcopy(tokenized))
         else:
+            if separator not in prompt:
+                continue
+            # Perform tokenization separately to allow for conditional prompting
             separation_idx = prompt.find(separator) + len(separator)
             prefix, postfix = prompt[:separation_idx], prompt[separation_idx:]
             tokenized_prefix = tokenizer.encode(
@@ -117,12 +124,27 @@ def tokenize_dataset(
                 postfix, add_special_tokens=False,
             )
             tokenized = tokenized_prefix + tokenized_postfix
-            if truncate and len(tokenized) > tokenizer.max_model_length:
-                tokenized = tokenized[:tokenizer.max_model_length]
-                tokenized = tokenized_prefix + tokenized_postfix
-            all_labels.append(
-                [-100] * len(tokenized_prefix) + deepcopy(tokenized_postfix),
-            )
+            if truncate and len(tokenized) > tokenizer.model_max_length:
+                tokenized = tokenized[:tokenizer.model_max_length - 1]
+                tokenized.append(tokenizer.eos_token_id)
+            # We need to address this separately, because labels need to
+            # backprop on bos/eos tokens
+            if add_bos_eos:
+                label = (
+                    [tokenizer.bos_token_id]
+                    + ([-100] * (len(tokenized_prefix) - 1))
+                    + deepcopy(tokenized_postfix)
+                )
+            else:
+                label = (
+                    [-100] * len(tokenized_prefix)
+                    + deepcopy(tokenized_postfix)
+                )
+            # If truncated, labels should be the same.
+            if truncate and len(label) > tokenizer.model_max_length:
+                label = label[:tokenizer.model_max_length - 1]
+                label.append(tokenizer.eos_token_id)
+            all_labels.append(label)
         all_input_ids.append(tokenized)
         all_attention_mask.append([1] * len(tokenized))
 
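
The truncation fix in `tokenize_dataset` is easy to see in isolation. Below is a minimal sketch with made-up values (`eos_id` and `model_max_length` stand in for the tokenizer's attributes): plain truncation can chop off the trailing EOS token, so the fixed code truncates to one token short of the limit and re-appends EOS.

eos_id = 2            # hypothetical EOS token id
model_max_length = 8  # hypothetical context limit

tokenized = [5, 17, 9, 33, 41, 8, 12, 7, 21, eos_id]  # ends with EOS

# Old behavior: the slice silently drops the trailing EOS.
old = tokenized[:model_max_length]       # [5, 17, 9, 33, 41, 8, 12, 7]

# Fixed behavior: truncate to max length minus one, then re-append EOS,
# so the sequence still fits and still terminates correctly.
new = tokenized[:model_max_length - 1]   # [5, 17, 9, 33, 41, 8, 12]
new.append(eos_id)                       # [5, 17, 9, 33, 41, 8, 12, 2]

assert len(new) == model_max_length and new[-1] == eos_id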

@@ -160,7 +182,8 @@ def pack_examples(
     """
     chunk_size = tokenizer.model_max_length
     if add_bos_eos:
-        chunk_size -= 2  # For BOS and EOS tokens.
+        # For BOS and EOS tokens.
+        chunk_size -= 2
         bos, eos = [tokenizer.bos_token_id], [tokenizer.eos_token_id]
     else:
         bos, eos = [], []
@@ -169,25 +192,42 @@ def pack_examples(
     if packing_type == "full":
         joined_examples = {k: sum(examples[k], []) for k in all_keys}
         total_length = len(joined_examples["input_ids"])
-        result = {
-            k: [
-                bos + v[i:i + chunk_size] + eos for i in range(
-                    0, total_length, stride,
-                )
-            ] for k, v in joined_examples.items()
-        }
+        result = {}
+        for k, v in joined_examples.items():
+            value_chunked_lst = []
+            for i in range(0, total_length, stride):
+                if k != "attention_mask":
+                    value_chunked_lst.append(bos + v[i:i + chunk_size] + eos)
+                else:
+                    if add_bos_eos:
+                        # Need to do this explicitly because attention mask
+                        # is just 1s or 0s.
+                        value_chunked_lst.append(
+                            [1] + v[i:i + chunk_size] + [1]
+                        )
+                    else:
+                        value_chunked_lst.append(v[i:i + chunk_size])
     elif packing_type == "partial":
         result = {k: [] for k in examples}
         _key = all_keys[0]
         for idx in range(len(examples[_key])):
             total_length = len(examples[_key][idx])
             for key in all_keys:
-                sliced_example = [
-                    (
-                        bos + examples[key][idx][i:i + chunk_size] + eos
-                    ) for i in range(0, total_length, stride)
-                ]
-                result[key].extend(sliced_example)
+                for i in range(0, total_length, stride):
+                    if key != "attention_mask":
+                        sliced_example = [
+                            bos + examples[key][idx][i:i + chunk_size] + eos
+                        ]
+                    else:
+                        if add_bos_eos:
+                            sliced_example = [
+                                [1] + examples[key][idx][i:i + chunk_size] + [1]
+                            ]
+                        else:
+                            sliced_example = [
+                                examples[key][idx][i:i + chunk_size]
+                            ]
+                    result[key].extend(sliced_example)
     else:
         msg = "`packing_type` needs to either be `full` or `partial`."
         raise ValueError(msg)
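
The attention-mask half of the fix in `pack_examples` can likewise be shown on its own. Below is a minimal sketch with made-up values (`bos_id`, `eos_id`, and the chunk contents are assumptions): when BOS/EOS ids are prepended and appended to every chunk of every key, the attention mask ends up holding token ids instead of binary values, so mask chunks must instead be padded with literal 1s.

bos_id, eos_id = 1, 2
chunk = [101, 102, 103]   # a chunk of input_ids
mask_chunk = [1, 1, 1]    # the matching chunk of attention_mask

# input_ids chunks correctly get the real BOS/EOS token ids.
packed_ids = [bos_id] + chunk + [eos_id]

# Old (buggy) behavior padded every key the same way, injecting a
# token id (here 2) into what should be a binary mask.
buggy_mask = [bos_id] + mask_chunk + [eos_id]  # [1, 1, 1, 1, 2]

# Fixed behavior: the added BOS/EOS positions are attended to, so the
# mask gets literal 1s at both ends.
fixed_mask = [1] + mask_chunk + [1]            # [1, 1, 1, 1, 1]

assert all(v in (0, 1) for v in fixed_mask)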

setup.py
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 setup(
     name="vectorlm",
-    version="1.0",
+    version="0.1.0",
     packages=find_packages(),
     install_requires=requirements,
     python_requires=">=3.10",
