Commit 4a1323b

Data processing bug (#4)

* added peft req
* added a dev-requirements file and fixed formatting
* added functionality to reset the dataloader after an epoch finishes
* Mistral training now works (the padding token was incorrectly set)
* added config ignore
* reverting gitignore
* fixed two bugs: labels for BOS/EOS tokens weren't added properly if there was a sequence separator, and the attention mask held the BOS/EOS token id instead of a 1 for those tokens specifically
* added a gotcha for the separator in data preprocessing
* fixed a small bug in truncation where the EOS token should be added after truncating

1 parent 8d62dd3 commit 4a1323b

File tree: 4 files changed, +65 −24 lines changed

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 *.out
 *.err
 *.egg-info
+**/*.sh
 __pycache__/
 wandb/
 build/

docs/config.md
Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ Similar to the wandb config above, these keyword parameters are fed directly int
 * `overlap`: When we chunk a data point during packing, we can choose to have some overlap between the current chunk and the next chunk. This might help the model understand surrounding context during training (although this isn't something we have empirically investigated, we keep this option available to users).
 * `add_bos_eos_tokens`: Whether to add `BOS` and `EOS` tokens as defined by the respective HuggingFace tokenizer. If using packing, these will be added after packing is done, so that each chunk of size `max_seq_len` has these tokens.
 * `from_disk`: Whether we are going to be loading the dataset to preprocess from disk (the other option is to download straight from HuggingFace).
-* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`).
+* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`). **Note:** if `separator` is not found in a given sequence, the default behavior is that the datapoint will be skipped and will not be part of the final set.
 * `load_path`: The directory containing the HuggingFace dataset we are loading to preprocess.
 * `split`: If `load_path` is a dataset dictionary, `split` specifies which key in this dictionary contains the dataset we are preprocessing.
 * `save_path`: The directory we will be saving the processed dataset to.
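
To make the `seperator` semantics concrete, here is a minimal sketch of conditional finetuning label masking. It is not code from this repo: the separator string, the choice of `gpt2` tokenizer, and `ignore_index = -100` (the label value PyTorch's cross-entropy loss skips) are all assumptions for illustration.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical tokenizer choice
ignore_index = -100          # label value skipped by the loss
separator = "### Response:"  # hypothetical separator string

text = "### Instruction: say hi ### Response: hi there"
cut = text.find(separator) + len(separator)
prefix, postfix = text[:cut], text[cut:]

prefix_ids = tokenizer.encode(prefix, add_special_tokens=False)
postfix_ids = tokenizer.encode(postfix, add_special_tokens=False)

input_ids = prefix_ids + postfix_ids
# Everything up to and including the separator is masked out of the loss;
# only the response tokens contribute gradients.
labels = [ignore_index] * len(prefix_ids) + postfix_ids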

preprocess_data.py
Lines changed: 62 additions & 22 deletions

@@ -93,21 +93,28 @@ def tokenize_dataset(
     all_input_ids = []
     all_attention_mask = []
     all_labels = []
+    # Adding bos/eos
     if add_bos_eos:
         bos, eos = tokenizer.bos_token, tokenizer.eos_token
     else:
         bos, eos = "", ""
     for example in examples[data_field]:
+        # If we want to include a prepended prompt to each datapoint
         if pre_pend:
             prompt = f"{bos}{pre_pend}{example}{eos}"
         else:
             prompt = f"{bos}{example}{eos}"
+        # If we've specified a separator present in each sequence
         if not separator:
             tokenized = tokenizer.encode(prompt, add_special_tokens=False)
-            if truncate and len(tokenized) > tokenizer.max_model_length:
-                tokenized = tokenized[:tokenizer.max_model_length]
+            if truncate and len(tokenized) > tokenizer.model_max_length:
+                tokenized = tokenized[:tokenizer.model_max_length - 1]
+                tokenized.append(tokenizer.eos_token_id)
             all_labels.append(deepcopy(tokenized))
         else:
+            if separator not in prompt:
+                continue
+            # Perform tokenization separately to allow for conditional prompting
             separation_idx = prompt.find(separator) + len(separator)
             prefix, postfix = prompt[:separation_idx], prompt[separation_idx:]
             tokenized_prefix = tokenizer.encode(
@@ -117,12 +124,27 @@ def tokenize_dataset(
                 postfix, add_special_tokens=False,
             )
             tokenized = tokenized_prefix + tokenized_postfix
-            if truncate and len(tokenized) > tokenizer.max_model_length:
-                tokenized = tokenized[:tokenizer.max_model_length]
-                tokenized = tokenized_prefix + tokenized_postfix
-            all_labels.append(
-                [-100] * len(tokenized_prefix) + deepcopy(tokenized_postfix),
-            )
+            if truncate and len(tokenized) > tokenizer.model_max_length:
+                tokenized = tokenized[:tokenizer.model_max_length - 1]
+                tokenized.append(tokenizer.eos_token_id)
+            # We need to address this separately, because labels need to
+            # backprop on bos/eos tokens
+            if add_bos_eos:
+                label = (
+                    [tokenizer.bos_token_id]
+                    + ([-100] * (len(tokenized_prefix) - 1))
+                    + deepcopy(tokenized_postfix)
+                )
+            else:
+                label = (
+                    [-100] * len(tokenized_prefix)
+                    + deepcopy(tokenized_postfix)
+                )
+            # If truncated, labels should be the same.
+            if truncate and len(label) > tokenizer.model_max_length:
+                label = label[:tokenizer.model_max_length - 1]
+                label.append(tokenizer.eos_token_id)
+            all_labels.append(label)
         all_input_ids.append(tokenized)
         all_attention_mask.append([1] * len(tokenized))
 
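
The truncation fix in `tokenize_dataset` is easy to see in isolation. Below is a minimal sketch with made-up values (`eos_id` and `model_max_length` stand in for the tokenizer's attributes): plain truncation can chop off the trailing EOS token, so the fixed code truncates to one token short of the limit and re-appends EOS.

eos_id = 2            # hypothetical EOS token id
model_max_length = 8  # hypothetical context limit

tokenized = [5, 17, 9, 33, 41, 8, 12, 7, 21, eos_id]  # ends with EOS

# Old behavior: the slice silently drops the trailing EOS.
old = tokenized[:model_max_length]       # [5, 17, 9, 33, 41, 8, 12, 7]

# Fixed behavior: truncate to max length minus one, then re-append EOS,
# so the sequence still fits and still terminates correctly.
new = tokenized[:model_max_length - 1]   # [5, 17, 9, 33, 41, 8, 12]
new.append(eos_id)                       # [5, 17, 9, 33, 41, 8, 12, 2]

assert len(new) == model_max_length and new[-1] == eos_id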

@@ -160,7 +182,8 @@ def pack_examples(
     """
     chunk_size = tokenizer.model_max_length
     if add_bos_eos:
-        chunk_size -= 2  # For BOS and EOS tokens.
+        # For BOS and EOS tokens.
+        chunk_size -= 2
         bos, eos = [tokenizer.bos_token_id], [tokenizer.eos_token_id]
     else:
         bos, eos = [], []
@@ -169,25 +192,42 @@ def pack_examples(
     if packing_type == "full":
         joined_examples = {k: sum(examples[k], []) for k in all_keys}
         total_length = len(joined_examples["input_ids"])
-        result = {
-            k: [
-                bos + v[i:i + chunk_size] + eos for i in range(
-                    0, total_length, stride,
-                )
-            ] for k, v in joined_examples.items()
-        }
+        result = {}
+        for k, v in joined_examples.items():
+            value_chunked_lst = []
+            for i in range(0, total_length, stride):
+                if k != "attention_mask":
+                    value_chunked_lst.append(bos + v[i:i + chunk_size] + eos)
+                else:
+                    if add_bos_eos:
+                        # Need to do this explicitly because attention mask
+                        # is just 1s or 0s.
+                        value_chunked_lst.append(
+                            [1] + v[i:i + chunk_size] + [1]
+                        )
+                    else:
+                        value_chunked_lst.append(v[i:i + chunk_size])
     elif packing_type == "partial":
         result = {k: [] for k in examples}
         _key = all_keys[0]
         for idx in range(len(examples[_key])):
             total_length = len(examples[_key][idx])
             for key in all_keys:
-                sliced_example = [
-                    (
-                        bos + examples[key][idx][i:i + chunk_size] + eos
-                    ) for i in range(0, total_length, stride)
-                ]
-                result[key].extend(sliced_example)
+                for i in range(0, total_length, stride):
+                    if key != "attention_mask":
+                        sliced_example = [
+                            bos + examples[key][idx][i:i + chunk_size] + eos
+                        ]
+                    else:
+                        if add_bos_eos:
+                            sliced_example = [
+                                [1] + examples[key][idx][i:i + chunk_size] + [1]
+                            ]
+                        else:
+                            sliced_example = [
+                                examples[key][idx][i:i + chunk_size]
+                            ]
+                    result[key].extend(sliced_example)
     else:
         msg = "`packing_type` needs to either be `full` or `partial`."
         raise ValueError(msg)
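
The attention-mask half of the fix in `pack_examples` can likewise be shown on its own. Below is a minimal sketch with made-up values (`bos_id`, `eos_id`, and the chunk contents are assumptions): when BOS/EOS ids are prepended and appended to every chunk of every key, the attention mask ends up holding token ids instead of binary values, so mask chunks must instead be padded with literal 1s.

bos_id, eos_id = 1, 2
chunk = [101, 102, 103]   # a chunk of input_ids
mask_chunk = [1, 1, 1]    # the matching chunk of attention_mask

# input_ids chunks correctly get the real BOS/EOS token ids.
packed_ids = [bos_id] + chunk + [eos_id]

# Old (buggy) behavior padded every key the same way, injecting a
# token id (here 2) into what should be a binary mask.
buggy_mask = [bos_id] + mask_chunk + [eos_id]  # [1, 1, 1, 1, 2]

# Fixed behavior: the added BOS/EOS positions are attended to, so the
# mask gets literal 1s at both ends.
fixed_mask = [1] + mask_chunk + [1]            # [1, 1, 1, 1, 1]

assert all(v in (0, 1) for v in fixed_mask)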

setup.py
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 setup(
     name="vectorlm",
-    version="1.0",
+    version="0.1.0",
     packages=find_packages(),
     install_requires=requirements,
     python_requires=">=3.10",
