From 98c0fab80fa4afd069d66ac02c7163356b94f3f8 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 16:22:35 -0500 Subject: [PATCH 1/4] Fixing phase10 data mixing to include knowledge pretraining Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/datamixing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 2a4f3923..7e140941 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -445,7 +445,7 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]] + generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]], use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -457,7 +457,12 @@ def _create_phase10_ds( knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=True ) - knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + pretraining_knowledge_ds = _generate_knowledge_qa_dataset( + generated_dataset, keep_context_separate=False + ).map( + lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format) + ) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) if auxiliary_dataset is not None: From 5667ac918e0d6ce05cf8782ed5a2154f66794c3f Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 16:44:55 -0500 Subject: [PATCH 2/4] Add phase07 pretraining data to phase10 Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/datamixing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 7e140941..63d20fbb 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -458,6 +458,7 @@ def _create_phase10_ds( generated_dataset, keep_context_separate=True ) raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + # Include phase07 pretraining_knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=False ).map( @@ -465,10 +466,11 @@ def _create_phase10_ds( ) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) + if auxiliary_dataset is not None: - phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]) else: - phase10 = knowledge_ds + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10 From cd0b73af9073ef0d1c4d3e063adc1259c0487104 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 17:20:52 -0500 Subject: [PATCH 3/4] Update function call to include the new param - use_legacy_pretraining_format Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/datamixing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 63d20fbb..d9a57606 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -608,7 +608,7 @@ def collect( ) skills_phase_data = _create_phase10_ds( - new_generated_data, self.auxiliary_inst + new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format ) output_file_leaf_skills = ( f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl" From 3040657b9155fd8cef9dd99e73624c23073f7a6f Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 11 Nov 2024 20:18:01 -0500 Subject: [PATCH 4/4] address mypy issues Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/datamixing.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index d9a57606..5172fdfb 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -445,7 +445,9 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]], use_legacy_pretraining_format: bool, + generated_dataset: Dataset, + auxiliary_inst: Optional[Dict[str, List[str]]], + use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -461,14 +463,14 @@ def _create_phase10_ds( # Include phase07 pretraining_knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=False - ).map( - lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format) - ) + ).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format)) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) if auxiliary_dataset is not None: - phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets( + [raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset] + ) else: phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10