Skip to content

Commit

Permalink
Merge pull request #554 from instructlab/mergify/bp/release-v0.7/pr-551
Browse files: browse the repository at this point in the history
fix: ensures the system prompt is set when mixing datasets from SDG (backport #551)
  • Loading branch information
bbrowning authored Feb 14, 2025
2 parents 7fff001 + 19a9732 commit 87d2da8
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
8 changes: 6 additions & 2 deletions docs/examples/mix_datasets/example_mixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@
output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

system_prompt = "You are a helpful assistant."

concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)
mix_datasets(
concatenate_recipe_yaml, concatenated_output_jsonl, system_prompt=system_prompt
)

weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl, system_prompt=system_prompt)
5 changes: 4 additions & 1 deletion src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,9 @@ def mix_datasets(
recipe_file: str,
output_file: str,
num_proc: Optional[int] = 8,
system_prompt: Optional[str] = None,
):
recipe = Recipe(recipe_file)
recipe = Recipe(recipe_file, system_prompt)
if recipe.datasets:
recipe.save_mixed_dataset(output_file, num_proc)
else:
Expand Down Expand Up @@ -719,10 +720,12 @@ def generate_data(
mix_datasets(
recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml",
output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl",
system_prompt=system_prompt,
)
mix_datasets(
recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml",
output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl",
system_prompt=system_prompt,
)

generate_duration = time.time() - generate_start
Expand Down

0 comments on commit 87d2da8

Please sign in to comment.