
Commit

Returned evaluate to the performance profiling, and inference only in case it is not mocked

Signed-off-by: dafnapension <[email protected]>
dafnapension committed Feb 4, 2025
1 parent 29f5dda commit 029691c
Showing 2 changed files with 85 additions and 15 deletions.
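
For orientation, the added code composes into the following flow, shown here as a condensed sketch using only names that appear in the diff below (the split value and the direct load_recipe call are assumptions made for illustration; the profiler wraps these steps in its own methods and CLI):

# Condensed sketch of the profiled flow after this commit; not part of the commit itself.
from unitxt.api import _source_to_dataset, evaluate, load_recipe
from unitxt.inference import CrossProviderInferenceEngine

dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"

# Build the benchmark recipe and generate its dataset.
benchmark_recipe = load_recipe(dataset_query)  # call signature assumed
dataset = list(_source_to_dataset(benchmark_recipe, split="test"))  # split value assumed

# The three steps this commit adds to the profiler: model, inference, evaluation.
model = CrossProviderInferenceEngine(model="llama-3-8b-instruct", max_tokens=30)
predictions = model.infer(dataset=dataset)
evaluation_result = evaluate(predictions=predictions, data=dataset)
print(len(evaluation_result))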
51 changes: 46 additions & 5 deletions performance/bluebench_profiler.py
@@ -5,10 +5,15 @@
import pstats
import tempfile
from io import StringIO
from typing import Any, Dict, List
from typing import Any, Dict, List, Union

from unitxt.api import _source_to_dataset, load_recipe
from unitxt.api import _source_to_dataset, evaluate, load_recipe
from unitxt.benchmark import Benchmark
from unitxt.inference import (
CrossProviderInferenceEngine,
InferenceEngine,
TextGenerationInferenceOutput,
)
from unitxt.logging_utils import get_logger
from unitxt.settings_utils import get_settings

@@ -65,6 +70,20 @@ def profiler_generate_benchmark_dataset(
dataset = _source_to_dataset(benchmark_recipe, split=split)
return list(dataset)

def profiler_instantiate_model(self) -> InferenceEngine:
return CrossProviderInferenceEngine(
model="llama-3-8b-instruct",
max_tokens=30,
)

def profiler_infer_predictions(
self, model: InferenceEngine, dataset: List[Dict[str, Any]]
) -> Union[List[str], List[TextGenerationInferenceOutput]]:
return model.infer(dataset=dataset)

def profiler_evaluate_predictions(self, predictions, dataset) -> dict:
return evaluate(predictions=predictions, data=dataset)

def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
benchmark_recipe = self.profiler_instantiate_benchmark_recipe(
dataset_query=dataset_query, **kwargs
@@ -74,7 +93,14 @@ def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
benchmark_recipe=benchmark_recipe, split=split, **kwargs
)

logger.critical(f"length of bluebench generated dataset: {len(dataset)}")
model = self.profiler_instantiate_model()

predictions = self.profiler_infer_predictions(model=model, dataset=dataset)

evaluation_result = self.profiler_evaluate_predictions(
predictions=predictions, dataset=dataset
)
logger.critical(f"length of evaluation_result: {len(evaluation_result)}")


dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"
@@ -128,29 +154,44 @@ def main():
pst.strip_dirs()
pst.sort_stats("name") # sort by function name
pst.print_stats(
"profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|load_data|load_iterables"
"profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|profiler_instantiate_model|profiler_infer_predictions|profiler_evaluate_predictions|load_data|load_iterables"
)
s = f.getvalue()
assert s.split("\n")[7].split()[3] == "cumtime"
overall_tot_time = find_cummtime_of(
"profile_benchmark_blue_bench", "bluebench_profiler.py", s
)
load_time = find_cummtime_of("load_data", "loaders.py", s)

just_load_no_initial_ms_time = find_cummtime_of(
"load_iterables", "loaders.py", s
)
instantiate_benchmark_time = find_cummtime_of(
"profiler_instantiate_benchmark_recipe", "bluebench_profiler.py", s
)
generate_benchmark_dataset_time = find_cummtime_of(
"profiler_generate_benchmark_dataset", "bluebench_profiler.py", s
)
instantiate_model_time = find_cummtime_of(
"profiler_instantiate_model", "bluebench_profiler.py", s
)
inference_time = find_cummtime_of(
"profiler_infer_predictions", "bluebench_profiler.py", s
)
evaluation_time = find_cummtime_of(
"profiler_evaluate_predictions", "bluebench_profiler.py", s
)

# Data to be written
dictionary = {
"dataset_query": dataset_query,
"total_time": overall_tot_time,
"load_time": load_time,
"load_time_no_initial_ms": just_load_no_initial_ms_time,
"instantiate_benchmark_time": instantiate_benchmark_time,
"generate_benchmark_dataset_time": generate_benchmark_dataset_time,
"instantiate_model_time": instantiate_model_time,
"inference_time": inference_time,
"evaluation_time": evaluation_time,
"used_eager_mode": settings.use_eager_execution,
"performance.prof file": temp_prof_file_path,
}
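With the three new timing probes in place (model instantiation, inference, evaluation), the JSON written by the profiler carries the fields of the dictionary above. An illustrative sketch of its shape follows; the keys come from the diff, while every value here is invented:

# Illustrative shape of the profiler's output JSON; all values below are invented.
example_profile = {
    "dataset_query": "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]",
    "total_time": 210.45,
    "load_time": 80.12,
    "load_time_no_initial_ms": 63.77,
    "instantiate_benchmark_time": 1.92,
    "generate_benchmark_dataset_time": 150.31,
    "instantiate_model_time": 0.58,
    "inference_time": 4.06,
    "evaluation_time": 48.9,
    "used_eager_mode": False,
    "performance.prof file": "/tmp/tmp_example.prof",  # invented path
}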
49 changes: 39 additions & 10 deletions performance/compare_benchmark_performance_results.py
@@ -2,6 +2,10 @@
import json
import sys

from unitxt.settings_utils import get_settings

settings = get_settings()

# Argument parser to get file paths from the command line
parser = argparse.ArgumentParser(description="Compare performance profiles.")
parser.add_argument(
@@ -22,11 +26,24 @@
print(f'dataset_query = "{main_perf["dataset_query"]}"')
print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
print(f"use Mocked inference = {settings.mock_inference_mode}")

ratio1 = (
(pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
/ (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
(pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
/ (
main_perf["generate_benchmark_dataset_time"]
- main_perf["load_time_no_initial_ms"]
)
if (
main_perf["generate_benchmark_dataset_time"]
- main_perf["load_time_no_initial_ms"]
)
> 0
else 1
)
ratio2 = (
pr_perf["evaluation_time"] / main_perf["evaluation_time"]
if main_perf["evaluation_time"] > 0
else 1
)
# Markdown table formatting
@@ -35,19 +52,31 @@
line2 = "--------------------|-------------|-------------|---------------\n"
line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
ratio_line4 = (
pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
if main_perf["load_time_no_initial_ms"] > 0
else 1
)
line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"

print("### Performance Comparison Results, time expressed in seconds:\n")
print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
if not settings.mock_inference_mode:
print(
line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10
)
else:
print(line1 + line2 + line3 + line4 + line5 + line6 + line8 + line9 + line10)
print("\n\n")
# Performance degradation check (5% threshold)
if ratio1 > 1.05:
print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
if ratio1 > 1.05 or ratio2 > 1.05:
print(
"\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
)
print(
"Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
"followed by 'snakeviz <the performance.prof file specified in the output json file>'."
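Reading the interleaved old and new lines together, the updated comparison reduces to two ratios sharing a 5% degradation threshold. A condensed restatement, with assumed file names standing in for the script's command-line arguments:

# Condensed restatement of the comparison logic above; the JSON file names are assumed.
import json

with open("main_results.json") as f:  # profiler output from the main branch
    main_perf = json.load(f)
with open("pr_results.json") as f:  # profiler output from the PR branch
    pr_perf = json.load(f)

# Dataset generation time minus the pure data-loading (load_iterables) portion.
main_gen = main_perf["generate_benchmark_dataset_time"] - main_perf["load_time_no_initial_ms"]
pr_gen = pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"]
ratio1 = pr_gen / main_gen if main_gen > 0 else 1

# Evaluation time.
ratio2 = (
    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
    if main_perf["evaluation_time"] > 0
    else 1
)

if ratio1 > 1.05 or ratio2 > 1.05:
    print("**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!")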

