import asyncio
import logging
from enum import Enum
from pathlib import Path

import dspy
import hydra
import neptune
from dspy.evaluate import Evaluate
from neptune.utils import stringify_unsupported
from omegaconf import DictConfig
from tuning.loaders import IQLGenerationDataLoader
from tuning.metrics import filtering_assess_acc
from tuning.programs import PROGRAMS
from tuning.utils import save, serialize_results

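# Silence noisy HTTP/LLM client loggers so only evaluation progress and results are shown.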
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("anthropic").setLevel(logging.ERROR)
log = logging.getLogger(__name__)


class EvaluationType(Enum):
    """
    Enum representing the evaluation type.
    """

    FILTERING_ASSESSOR = "FILTERING_ASSESSOR"


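# Registries mapping each evaluation type to the dataloader and metric used for it.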
EVALUATION_DATALOADERS = {
    EvaluationType.FILTERING_ASSESSOR.value: IQLGenerationDataLoader,
}

EVALUATION_METRICS = {
    EvaluationType.FILTERING_ASSESSOR.value: filtering_assess_acc,
}


async def evaluate(config: DictConfig) -> None:
    """
    Runs the evaluation for all datasets and evaluation tasks defined in the Hydra config.

    Args:
        config: Hydra configuration.
    """
    log.info("Starting evaluation: %s", config.program.name)

    dataloader = EVALUATION_DATALOADERS[config.program.type](config)
    metric = EVALUATION_METRICS[config.program.type]
    program = PROGRAMS[config.program.name]()

    dataset = await dataloader.load()

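    # Look up the LM class exposed on the dspy module under the configured provider name
    # and register it as the default LM for all dspy calls.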
    lm = getattr(dspy, config.llm.provider)(model=config.llm.model_name)
    dspy.settings.configure(lm=lm)

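    # With return_outputs=True the evaluator returns the aggregate metric together with
    # the per-example outputs (based on the dspy.evaluate.Evaluate API).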
    evaluator = Evaluate(
        devset=dataset,
        metric=metric,
        num_threads=32,
        display_progress=True,
        return_outputs=True,
    )
    metric, results = evaluator(program)

    log.info("Evaluation finished. Saving results...")

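    # Hydra creates a dedicated output directory for each run; store the serialized
    # results there.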
    output_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
    results_file = output_dir / "results.json"
    save(results_file, results=serialize_results(results))

    log.info("Evaluation results saved in directory: %s", output_dir)

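    # Optionally log the run to Neptune: tag it with the task metadata and upload the
    # config, the accuracy metric, and the results file.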
    if config.neptune:
        run = neptune.init_run()
        run["sys/tags"].add(
            [
                config.program.type,
                config.program.name,
                *config.data.db_ids,
                *config.data.difficulties,
            ]
        )
        run["config"] = stringify_unsupported(config)
        run["evaluation/metrics/ACC"] = stringify_unsupported(metric)
        run["evaluation/results.json"].upload(results_file.as_posix())


@hydra.main(config_path="config", config_name="config", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Hydra entry point running the evaluation for all datasets and evaluation tasks
    defined in the Hydra config.

    Args:
        config: Hydra configuration.
    """
    asyncio.run(evaluate(config))


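# Example invocation (script name and override values are illustrative; the available
# keys come from the Hydra config under config/):
#   python evaluate.py program.name=<program> llm.model_name=<model>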
if __name__ == "__main__":
    main()  # pylint: disable=no-value-for-parameter