Skip to content

Commit

Permalink
Adjust backport to remove use_legacy_pretraining_format
Browse files: browse the repository at this point in the history
Older versions of SDG did not have the `use_legacy_pretraining_format`
parameter, so this commit removes that parameter from the backported code.

Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
bbrowning committed Nov 12, 2024
1 parent f7a840e commit 625d9ab
Showing 1 changed file with 2 additions and 3 deletions.
5 changes: 2 additions & 3 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,6 @@ def __create_auxiliary_ds(rec):
def _create_phase10_ds(
generated_dataset: Dataset,
auxiliary_inst: Optional[Dict[str, List[str]]],
use_legacy_pretraining_format: bool,
):
"""
Create a dataset for Phase 1.0 of downstream training.
Expand All @@ -456,7 +455,7 @@ def _create_phase10_ds(
# Include phase07
pretraining_knowledge_ds = _generate_knowledge_qa_dataset(
generated_dataset, keep_context_separate=False
).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format))
).map(_conv_pretrain)

auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst)

Expand Down Expand Up @@ -591,7 +590,7 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge):
)

skills_phase_data = _create_phase10_ds(
new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format
new_generated_data, self.auxiliary_inst
)
output_file_leaf_skills = (
f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
Expand Down

0 comments on commit 625d9ab

Please sign in to comment.