From 7caa106bd688fc94ca8dede24ad27b6553d0a795 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 16:22:35 -0500 Subject: [PATCH 1/5] Fixing phase10 data mixing to include knowledge pretraining Signed-off-by: Aakanksha Duggal (cherry picked from commit 98c0fab80fa4afd069d66ac02c7163356b94f3f8) --- src/instructlab/sdg/datamixing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 3cab0156..4ceaf90a 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -438,7 +438,7 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]] + generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]], use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -450,7 +450,12 @@ def _create_phase10_ds( knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=True ) - knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + pretraining_knowledge_ds = _generate_knowledge_qa_dataset( + generated_dataset, keep_context_separate=False + ).map( + lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format) + ) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) if auxiliary_dataset is not None: From 14da6c7bd29a3156a054ef0591b649c77937b63c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 16:44:55 -0500 Subject: [PATCH 2/5] Add phase07 pretraining data to phase10 Signed-off-by: Aakanksha Duggal (cherry picked from commit 5667ac918e0d6ce05cf8782ed5a2154f66794c3f) --- src/instructlab/sdg/datamixing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 4ceaf90a..b84f5e46 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -451,6 +451,7 @@ def _create_phase10_ds( generated_dataset, keep_context_separate=True ) raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + # Include phase07 pretraining_knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=False ).map( @@ -458,10 +459,11 @@ def _create_phase10_ds( ) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) + if auxiliary_dataset is not None: - phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]) else: - phase10 = knowledge_ds + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10 From bf4af43be6c8063b59279919d2342250006625e0 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 17:20:52 -0500 Subject: [PATCH 3/5] Update function call to include the new param - use_legacy_pretraining_format Signed-off-by: Aakanksha Duggal (cherry picked from commit cd0b73af9073ef0d1c4d3e063adc1259c0487104) --- src/instructlab/sdg/datamixing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index b84f5e46..47975528 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -589,7 +589,7 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge): ) skills_phase_data = _create_phase10_ds( - new_generated_data, self.auxiliary_inst + new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format ) output_file_leaf_skills = ( f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl" From f7a840e5512f1dc268ac1493de9dca4c9b01c5ab Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 20:18:01 -0500 Subject: [PATCH 4/5] address mypy issues Signed-off-by: Aakanksha Duggal (cherry picked from commit 3040657b9155fd8cef9dd99e73624c23073f7a6f) --- src/instructlab/sdg/datamixing.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 47975528..a18738f7 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -438,7 +438,9 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]], use_legacy_pretraining_format: bool, + generated_dataset: Dataset, + auxiliary_inst: Optional[Dict[str, List[str]]], + use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -454,14 +456,14 @@ def _create_phase10_ds( # Include phase07 pretraining_knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=False - ).map( - lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format) - ) + ).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format)) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) if auxiliary_dataset is not None: - phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets( + [raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset] + ) else: phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10 From 625d9abb6e09ef415656ac2c876ade18bef838d1 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Tue, 12 Nov 2024 10:25:25 -0500 Subject: [PATCH 5/5] Adjust backport to remove `use_legacy_pretraining_format` Older versions of SDG did not have the `use_legacy_pretraining_format` parameter, so this just removes that from the backported code. Signed-off-by: Ben Browning --- src/instructlab/sdg/datamixing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index a18738f7..7c0caca0 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -440,7 +440,6 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]], - use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -456,7 +455,7 @@ def _create_phase10_ds( # Include phase07 pretraining_knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=False - ).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format)) + ).map(_conv_pretrain) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) @@ -591,7 +590,7 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge): ) skills_phase_data = _create_phase10_ds( - new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format + new_generated_data, self.auxiliary_inst ) output_file_leaf_skills = ( f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"